opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.textsimilarity;

 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;

 import org.apache.commons.collections.ListUtils;
 import org.apache.commons.lang3.StringUtils;

 import opennlp.tools.parse_thicket.ParseTreeNode;

 public class ParseTreeChunk {
 	private String mainPOS;

 	private List<String> lemmas;

 	private List<String> POSs;

 	private int startPos;

 	private int endPos;

 	private int size;

 	private ParseTreeMatcher parseTreeMatcher;

 	private LemmaFormManager lemmaFormManager;

 	private GeneralizationListReducer generalizationListReducer;

 	private List<ParseTreeNode> parseTreeNodes;


 	public List<ParseTreeNode> getParseTreeNodes() {
 		return parseTreeNodes;
 	}

 	public void setParseTreeNodes(List<ParseTreeNode> parseTreeNodes) {
 		this.parseTreeNodes = parseTreeNodes;
 	}

 	public ParseTreeChunk(){};
 	// "[<1>NP'Property':NN, <2>NP'has':VBZ, <3>NP'lots':NNS, <4>NP'of':IN, <5>NP'trash':NN, <6>NP'and':CC, <7>NP'debris':NN]";

 	public ParseTreeChunk(String phrStr){
 		String[] parts = phrStr.replace("]","").split(", <");
 		this.POSs = new ArrayList<String>();
 		this.lemmas = new ArrayList<String>();
 		this.mainPOS = StringUtils.substringBetween(phrStr, ">", "'");
 		for(String part: parts){
 			String lemma = StringUtils.substringBetween(part, "P'", "':");
 			String pos = part.substring(part.indexOf(":")+1, part.length());

 			if (pos==null || lemma ==null){
 				continue;
 			}
 			this.POSs.add(pos.trim());
 			this.lemmas.add(lemma.trim());
 		}

 	}

 	public ParseTreeChunk(List<String> lemmas, List<String> POSs, int startPos,
 			int endPos) {
 		this.lemmas = lemmas;
 		this.POSs = POSs;
 		this.startPos = startPos;
 		this.endPos = endPos;

 		// phraseType.put(0, "np");
 	}

 	// constructor which takes lemmas and POS as lists so that phrases can be
 	// conveniently specified.
 	// usage: stand-alone runs
 	public ParseTreeChunk(String mPOS, String[] lemmas, String[] POSss) {
 		this.mainPOS = mPOS;
 		this.lemmas = new ArrayList<String>();
 		for (String l : lemmas) {
 			this.lemmas.add(l);
 		}
 		this.POSs = new ArrayList<String>();
 		for (String p : POSss) {
 			this.POSs.add(p);
 		}
 	}

 	// constructor which takes lemmas and POS as lists so that phrases can be
 	// conveniently specified.
 	// usage: stand-alone runs
 	public ParseTreeChunk(String mPOS, List<String> lemmas, List<String> POSss) {
 		this.mainPOS = mPOS;
 		this.lemmas = lemmas;
 		this.POSs = POSss;
 	}


 	public int getStartPos() {
 		return startPos;
 	}

 	public void setStartPos(int startPos) {
 		this.startPos = startPos;
 	}

 	public int getEndPos() {
 		return endPos;
 	}

 	public void setEndPos(int endPos) {
 		this.endPos = endPos;
 	}

 	public int getSize() {
 		return size;
 	}

 	public void setSize(int size) {
 		this.size = size;
 	}

 	public LemmaFormManager getLemmaFormManager() {
 		return lemmaFormManager;
 	}

 	public void setLemmaFormManager(LemmaFormManager lemmaFormManager) {
 		this.lemmaFormManager = lemmaFormManager;
 	}

 	public GeneralizationListReducer getGeneralizationListReducer() {
 		return generalizationListReducer;
 	}

 	public void setGeneralizationListReducer(
 			GeneralizationListReducer generalizationListReducer) {
 		this.generalizationListReducer = generalizationListReducer;
 	}

 	public void setParseTreeMatcher(ParseTreeMatcher parseTreeMatcher) {
 		this.parseTreeMatcher = parseTreeMatcher;
 	}

 	public  ParseTreeChunk(List<ParseTreeNode> ps) {
 		this.lemmas = new ArrayList<String>();
 		this.POSs = new ArrayList<String>();
 		for(ParseTreeNode n: ps){
 			this.lemmas.add(n.getWord());
 			this.POSs.add(n.getPos());
 		}

 		if (ps.size()>0){
 			this.setMainPOS(ps.get(0).getPhraseType());
 			this.parseTreeNodes = ps;
 		}
 	}

 	public List<ParseTreeChunk> buildChunks(List<LemmaPair> parseResults) {
 		List<ParseTreeChunk> chunksResults = new ArrayList<ParseTreeChunk>();
 		for (LemmaPair chunk : parseResults) {
 			String[] lemmasAr = chunk.getLemma().split(" ");
 			List<String> poss = new ArrayList<String>(), lems = new ArrayList<String>();
 			for (String lem : lemmasAr) {
 				lems.add(lem);
 				// now looking for POSs for individual word
 				for (LemmaPair chunkCur : parseResults) {
 					if (chunkCur.getLemma().equals(lem)
 							&&
 							// check that this is a proper word in proper position
 							chunkCur.getEndPos() <= chunk.getEndPos()
 							&& chunkCur.getStartPos() >= chunk.getStartPos()) {
 						poss.add(chunkCur.getPOS());
 						break;
 					}
 				}
 			}
 			if (lems.size() != poss.size()) {
 				System.err.println("lems.size()!= poss.size()");
 			}
 			if (lems.size() < 2) { // single word phrase, nothing to match
 				continue;
 			}
 			ParseTreeChunk ch = new ParseTreeChunk(lems, poss, chunk.getStartPos(),
 					chunk.getEndPos());
 			ch.setMainPOS(chunk.getPOS());
 			chunksResults.add(ch);
 		}
 		return chunksResults;
 	}

 	public List<List<ParseTreeChunk>> matchTwoSentencesGivenPairLists(
 			List<LemmaPair> sent1Pairs, List<LemmaPair> sent2Pairs) {

 		List<ParseTreeChunk> chunk1List = buildChunks(sent1Pairs);
 		List<ParseTreeChunk> chunk2List = buildChunks(sent2Pairs);

 		List<List<ParseTreeChunk>> sent1GrpLst = groupChunksAsParses(chunk1List);
 		List<List<ParseTreeChunk>> sent2GrpLst = groupChunksAsParses(chunk2List);

 		System.out.println("=== Grouped chunks 1 " + sent1GrpLst);
 		System.out.println("=== Grouped chunks 2 " + sent2GrpLst);

 		return matchTwoSentencesGroupedChunks(sent1GrpLst, sent2GrpLst);
 	}

 	// groups noun phrases, verb phrases, propos phrases etc. for separate match

 	public List<List<ParseTreeChunk>> groupChunksAsParses(
 			List<ParseTreeChunk> parseResults) {
 		List<ParseTreeChunk> np = new ArrayList<ParseTreeChunk>(), vp = new ArrayList<ParseTreeChunk>(), prp = new ArrayList<ParseTreeChunk>(), sbarp = new ArrayList<ParseTreeChunk>(), pp = new ArrayList<ParseTreeChunk>(), adjp = new ArrayList<ParseTreeChunk>(), whadvp = new ArrayList<ParseTreeChunk>(), restOfPhrasesTypes = new ArrayList<ParseTreeChunk>();
 		List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
 		for (ParseTreeChunk ch : parseResults) {
 			String mainPos = ch.getMainPOS().toLowerCase();

 			if (mainPos.equals("s")) {
 				continue;
 			}
 			if (mainPos.equals("np")) {
 				np.add(ch);
 			} else if (mainPos.equals("vp")) {
 				vp.add(ch);
 			} else if (mainPos.equals("prp")) {
 				prp.add(ch);
 			} else if (mainPos.equals("pp")) {
 				pp.add(ch);
 			} else if (mainPos.equals("adjp")) {
 				adjp.add(ch);
 			} else if (mainPos.equals("whadvp")) {
 				whadvp.add(ch);
 			} else if (mainPos.equals("sbar")) {
 				sbarp.add(ch);
 			} else {
 				restOfPhrasesTypes.add(ch);
 			}

 		}
 		results.add(np);
 		results.add(vp);
 		results.add(prp);
 		results.add(pp);
 		results.add(adjp);
 		results.add(whadvp);
 		results.add(restOfPhrasesTypes);

 		return results;

 	}

 	// main function to generalize two expressions grouped by phrase types
 	// returns a list of generalizations for each phrase type with filtered
 	// sub-expressions
 	public List<List<ParseTreeChunk>> matchTwoSentencesGroupedChunks(
 			List<List<ParseTreeChunk>> sent1, List<List<ParseTreeChunk>> sent2) {
 		List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
 		// first irerate through component
 		for (int comp = 0; comp < 2 && // just np & vp
 				comp < sent1.size() && comp < sent2.size(); comp++) {
 			List<ParseTreeChunk> resultComps = new ArrayList<ParseTreeChunk>();
 			// then iterate through each phrase in each component
 			for (ParseTreeChunk ch1 : sent1.get(comp)) {
 				for (ParseTreeChunk ch2 : sent2.get(comp)) { // simpler version
 					ParseTreeChunk chunkToAdd = parseTreeMatcher
 							.generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(
 									ch1, ch2);

 					if (!lemmaFormManager.mustOccurVerifier(ch1, ch2, chunkToAdd)) {
 						continue; // if the words which have to stay do not stay, proceed to
 						// other elements
 					}
 					Boolean alreadyThere = false;
 					for (ParseTreeChunk chunk : resultComps) {
 						if (chunk.equalsTo(chunkToAdd)) {
 							alreadyThere = true;
 							break;
 						}

 						if (parseTreeMatcher
 								.generalizeTwoGroupedPhrasesRandomSelectHighestScore(chunk,
 										chunkToAdd).equalsTo(chunkToAdd)) {
 							alreadyThere = true;
 							break;
 						}
 					}

 					if (!alreadyThere) {
 						resultComps.add(chunkToAdd);
 					}

 					List<ParseTreeChunk> resultCompsReduced = generalizationListReducer
 							.applyFilteringBySubsumption(resultComps);
 					// if (resultCompsReduced.size() != resultComps.size())
 						// System.out.println("reduction of gen list occurred");
 				}
 			}
 			results.add(resultComps);
 		}

 		return results;
 	}

 /*	public Boolean equals(ParseTreeChunk ch) {
 		List<String> lems = ch.getLemmas();
 		List<String> poss = ch.POSs;

 		if (this.lemmas.size() <= lems.size())
 			return false; // sub-chunk should be shorter than chunk

 		for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) {
 			if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(
 					poss.get(i))))
 				return false;
 		}
 		return true;
 	}
 */
 	// 'this' is super - chunk of ch, ch is sub-chunk of 'this'
 	public Boolean isASubChunk_OLD(ParseTreeChunk ch) {
 		List<String> lems = ch.getLemmas();
 		List<String> poss = ch.POSs;

 		if (this.lemmas.size() < lems.size())
 			return false; // sub-chunk should be shorter than chunk

 		for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) {
 			if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(
 					poss.get(i))))
 				return false;
 		}
 		return true;
 	}

 	// this => value   ch => *
 	public Boolean isASubChunk(ParseTreeChunk ch) {
 		List<String> lems = ch.getLemmas();
 		List<String> poss = ch.POSs;

 		if (this.lemmas.size() < lems.size())
 			return false; // sub-chunk should be shorter than chunk

 		Boolean notSubChunkWithGivenAlignment = false, unComparable = false;

 		for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) {
 			// both lemma and pos are different
 			if (!this.POSs.get(i).equals(poss.get(i)) && !this.lemmas.get(i).equals(lems.get(i)) ){
 				unComparable = true;
 				break;
 			}

 			// this => *  ch=> run
 			if (!this.lemmas.get(i).equals(lems.get(i)) && this.lemmas.get(i).equals("*"))
 				notSubChunkWithGivenAlignment = true;
 		}
 		if (!notSubChunkWithGivenAlignment && !unComparable)
 			return true;

 		List<String> thisPOS = new ArrayList<String> ( this.POSs);
 		Collections.reverse(thisPOS);
 		List<String> chPOS = new ArrayList<String> ( poss);
 		Collections.reverse(chPOS);
 		List<String> thisLemma = new ArrayList<String> ( this.lemmas);
 		Collections.reverse(thisLemma );
 		List<String> chLemma = new ArrayList<String> ( lems);
 		Collections.reverse(chLemma);

 		notSubChunkWithGivenAlignment = false; unComparable = false;
 		for (int i = lems.size()-1 ; i>=0; i--) {
 			// both lemma and pos are different
 			if (!thisPOS.get(i).equals(chPOS.get(i)) && !thisLemma.get(i).equals(chLemma.get(i)) ){
 				unComparable = true;
 				break;
 			}

 			// this => *  ch=> run
 			if (!thisLemma.get(i).equals(chLemma.get(i)) && thisLemma.get(i).equals("*"))
 				notSubChunkWithGivenAlignment = true;
 		}

 		if (!notSubChunkWithGivenAlignment && !unComparable)
 			return true;
 		else
 			return false; // then ch is redundant and needs to be removed
 	}

 	public Boolean equalsTo(ParseTreeChunk ch) {
 		List<String> lems = ch.getLemmas();
 		List<String> poss = ch.POSs;
 		if (this.lemmas.size() != lems.size() || this.POSs.size() != poss.size())
 			return false;

 		for (int i = 0; i < lems.size(); i++) {
 			if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(
 					poss.get(i))))
 				return false;
 		}

 		return true;
 	}

 	public boolean equals(ParseTreeChunk ch) {
 		List<String> lems = ch.getLemmas();
 		List<String> poss = ch.POSs;
 		return ListUtils.isEqualList(ch.getLemmas(), this.lemmas) && ListUtils.isEqualList(ch.getPOSs(), this.POSs);
 	}

 	public String toString() {
 		String buf = " [";
 		if (mainPOS != null)
 			buf = mainPOS + " [";
 		for (int i = 0; i < lemmas.size() && i < POSs.size() ; i++) {
 			buf += POSs.get(i) + "-" + lemmas.get(i) + " ";
 			if (this.parseTreeNodes!=null){
 				Map<String, Object> attrs = this.parseTreeNodes.get(i).getAttributes();
 				if (attrs!=null && attrs.keySet().size()>0){
 					buf += attrs+ " ";
 				}
 				String ner =this.parseTreeNodes.get(i).getNe();
 				if (ner!=null && ner.length()>1)
 					buf+="("+ner+ ") ";
 			}
 		}
 		return buf + "]";
 	}

 	public String toWordOnlyString(){
 		String buf = "";

 		for (int i = 0; i < lemmas.size()  ; i++) {
 			buf+=lemmas.get(i)+" ";
 		}
 		return buf.trim();
 	}

 	public int compareTo(ParseTreeChunk o) {
 		if (this.size > o.size)
 			return -1;
 		else
 			return 1;

 	}

 	public String listToString(List<List<ParseTreeChunk>> chunks) {
 		StringBuffer buf = new StringBuffer();
 		if (chunks.get(0).size() > 0) {
 			buf.append(" np " + chunks.get(0).toString());
 		}
 		if (chunks.get(1).size() > 0) {
 			buf.append(" vp " + chunks.get(1).toString());
 		}
 		if (chunks.size() < 3) {
 			return buf.toString();
 		}
 		if (chunks.get(2).size() > 0) {
 			buf.append(" prp " + chunks.get(2).toString());
 		}
 		if (chunks.get(3).size() > 0) {
 			buf.append(" pp " + chunks.get(3).toString());
 		}
 		if (chunks.get(4).size() > 0) {
 			buf.append(" adjp " + chunks.get(4).toString());
 		}
 		if (chunks.get(5).size() > 0) {
 			buf.append(" whadvp " + chunks.get(5).toString());
 		}
 		/*
 		 * if (mainPos.equals("np")) np.add(ch); else if (mainPos.equals( "vp"))
 		 * vp.add(ch); else if (mainPos.equals( "prp")) prp.add(ch); else if
 		 * (mainPos.equals( "pp")) pp.add(ch); else if (mainPos.equals( "adjp"))
 		 * adjp.add(ch); else if (mainPos.equals( "whadvp")) whadvp.add(ch);
 		 */
 		return buf.toString();
 	}

 	public List<List<ParseTreeChunk>> obtainParseTreeChunkListByParsingList(
 			String toParse) {
 		List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
 		// if (toParse.endsWith("]]]")){
 		// toParse = toParse.replace("[[","").replace("]]","");
 		// }
 		toParse = toParse.replace(" ]], [ [", "&");
 		String[] phraseTypeFragments = toParse.trim().split("&");
 		for (String toParseFragm : phraseTypeFragments) {
 			toParseFragm = toParseFragm.replace("],  [", "#");

 			List<ParseTreeChunk> resultsPhraseType = new ArrayList<ParseTreeChunk>();
 			String[] indivChunks = toParseFragm.trim().split("#");
 			for (String expr : indivChunks) {
 				List<String> lems = new ArrayList<String>(), poss = new ArrayList<String>();
 				expr = expr.replace("[", "").replace(" ]", "");
 				String[] pairs = expr.trim().split(" ");
 				for (String word : pairs) {
 					word = word.replace("]]", "").replace("]", "");
 					String[] pos_lem = word.split("-");
 					lems.add(pos_lem[1].trim());
 					poss.add(pos_lem[0].trim());
 				}
 				ParseTreeChunk ch = new ParseTreeChunk();
 				ch.setLemmas(lems);
 				ch.setPOSs(poss);
 				resultsPhraseType.add(ch);
 			}
 			results.add(resultsPhraseType);
 		}
 		System.out.println(results);
 		return results;

 		// 2.1 | Vietnam <b>embassy</b> <b>in</b> <b>Israel</b>: information on how
 		// to get your <b>visa</b> at Vietnam
 		// <b>embassy</b> <b>in</b> <b>Israel</b>. <b>...</b> <b>Spain</b>.
 		// Scotland. Sweden. Slovakia. Switzerland. T
 		// [Top of Page] <b>...</b>
 		// [[ [NN-* IN-in NP-israel ], [NP-* IN-in NP-israel ], [NP-* IN-* TO-* NN-*
 		// ], [NN-visa IN-* NN-* IN-in ]], [
 		// [VB-get NN-visa IN-* NN-* IN-in .-* ], [VBD-* IN-* NN-* NN-* .-* ], [VB-*
 		// NP-* ]]]

 	}

 	public void setMainPOS(String mainPOS) {
 		this.mainPOS = mainPOS;
 	}

 	public String getMainPOS() {
 		return mainPOS;
 	}

 	public List<String> getLemmas() {
 		return lemmas;
 	}

 	public void setLemmas(List<String> lemmas) {
 		this.lemmas = lemmas;
 	}

 	public List<String> getPOSs() {
 		return POSs;
 	}

 	public void setPOSs(List<String> pOSs) {
 		POSs = pOSs;
 	}

 	public ParseTreeMatcher getParseTreeMatcher() {
 		return parseTreeMatcher;
 	}

 	public static void main(String[] args){
 		String phrStr = "[<1>NP'Property':NN, <2>NP'has':VBZ, <3>NP'lots':NNS, <4>NP'of':IN, <5>NP'trash':NN, <6>NP'and':CC, <7>NP'debris':NN]";
 	    ParseTreeChunk ch = new ParseTreeChunk(phrStr);
 	    System.out.println(ch);
 	}
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.textsimilarity;

	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.List;
	import java.util.Map;

	import org.apache.commons.collections.ListUtils;
	import org.apache.commons.lang3.StringUtils;

	import opennlp.tools.parse_thicket.ParseTreeNode;

	public class ParseTreeChunk {
	private String mainPOS;

	private List<String> lemmas;

	private List<String> POSs;

	private int startPos;

	private int endPos;

	private int size;

	private ParseTreeMatcher parseTreeMatcher;

	private LemmaFormManager lemmaFormManager;

	private GeneralizationListReducer generalizationListReducer;

	private List<ParseTreeNode> parseTreeNodes;


	public List<ParseTreeNode> getParseTreeNodes() {
	return parseTreeNodes;
	}

	public void setParseTreeNodes(List<ParseTreeNode> parseTreeNodes) {
	this.parseTreeNodes = parseTreeNodes;
	}

	public ParseTreeChunk(){};
	// "[<1>NP'Property':NN, <2>NP'has':VBZ, <3>NP'lots':NNS, <4>NP'of':IN, <5>NP'trash':NN, <6>NP'and':CC, <7>NP'debris':NN]";

	public ParseTreeChunk(String phrStr){
	String[] parts = phrStr.replace("]","").split(", <");
	this.POSs = new ArrayList<String>();
	this.lemmas = new ArrayList<String>();
	this.mainPOS = StringUtils.substringBetween(phrStr, ">", "'");
	for(String part: parts){
	String lemma = StringUtils.substringBetween(part, "P'", "':");
	String pos = part.substring(part.indexOf(":")+1, part.length());

	if (pos==null \|\| lemma ==null){
	continue;
	}
	this.POSs.add(pos.trim());
	this.lemmas.add(lemma.trim());
	}

	}

	public ParseTreeChunk(List<String> lemmas, List<String> POSs, int startPos,
	int endPos) {
	this.lemmas = lemmas;
	this.POSs = POSs;
	this.startPos = startPos;
	this.endPos = endPos;

	// phraseType.put(0, "np");
	}

	// constructor which takes lemmas and POS as lists so that phrases can be
	// conveniently specified.
	// usage: stand-alone runs
	public ParseTreeChunk(String mPOS, String[] lemmas, String[] POSss) {
	this.mainPOS = mPOS;
	this.lemmas = new ArrayList<String>();
	for (String l : lemmas) {
	this.lemmas.add(l);
	}
	this.POSs = new ArrayList<String>();
	for (String p : POSss) {
	this.POSs.add(p);
	}
	}

	// constructor which takes lemmas and POS as lists so that phrases can be
	// conveniently specified.
	// usage: stand-alone runs
	public ParseTreeChunk(String mPOS, List<String> lemmas, List<String> POSss) {
	this.mainPOS = mPOS;
	this.lemmas = lemmas;
	this.POSs = POSss;
	}


	public int getStartPos() {
	return startPos;
	}

	public void setStartPos(int startPos) {
	this.startPos = startPos;
	}

	public int getEndPos() {
	return endPos;
	}

	public void setEndPos(int endPos) {
	this.endPos = endPos;
	}

	public int getSize() {
	return size;
	}

	public void setSize(int size) {
	this.size = size;
	}

	public LemmaFormManager getLemmaFormManager() {
	return lemmaFormManager;
	}

	public void setLemmaFormManager(LemmaFormManager lemmaFormManager) {
	this.lemmaFormManager = lemmaFormManager;
	}

	public GeneralizationListReducer getGeneralizationListReducer() {
	return generalizationListReducer;
	}

	public void setGeneralizationListReducer(
	GeneralizationListReducer generalizationListReducer) {
	this.generalizationListReducer = generalizationListReducer;
	}

	public void setParseTreeMatcher(ParseTreeMatcher parseTreeMatcher) {
	this.parseTreeMatcher = parseTreeMatcher;
	}

	public ParseTreeChunk(List<ParseTreeNode> ps) {
	this.lemmas = new ArrayList<String>();
	this.POSs = new ArrayList<String>();
	for(ParseTreeNode n: ps){
	this.lemmas.add(n.getWord());
	this.POSs.add(n.getPos());
	}

	if (ps.size()>0){
	this.setMainPOS(ps.get(0).getPhraseType());
	this.parseTreeNodes = ps;
	}
	}

	public List<ParseTreeChunk> buildChunks(List<LemmaPair> parseResults) {
	List<ParseTreeChunk> chunksResults = new ArrayList<ParseTreeChunk>();
	for (LemmaPair chunk : parseResults) {
	String[] lemmasAr = chunk.getLemma().split(" ");
	List<String> poss = new ArrayList<String>(), lems = new ArrayList<String>();
	for (String lem : lemmasAr) {
	lems.add(lem);
	// now looking for POSs for individual word
	for (LemmaPair chunkCur : parseResults) {
	if (chunkCur.getLemma().equals(lem)
	&&
	// check that this is a proper word in proper position
	chunkCur.getEndPos() <= chunk.getEndPos()
	&& chunkCur.getStartPos() >= chunk.getStartPos()) {
	poss.add(chunkCur.getPOS());
	break;
	}
	}
	}
	if (lems.size() != poss.size()) {
	System.err.println("lems.size()!= poss.size()");
	}
	if (lems.size() < 2) { // single word phrase, nothing to match
	continue;
	}
	ParseTreeChunk ch = new ParseTreeChunk(lems, poss, chunk.getStartPos(),
	chunk.getEndPos());
	ch.setMainPOS(chunk.getPOS());
	chunksResults.add(ch);
	}
	return chunksResults;
	}

	public List<List<ParseTreeChunk>> matchTwoSentencesGivenPairLists(
	List<LemmaPair> sent1Pairs, List<LemmaPair> sent2Pairs) {

	List<ParseTreeChunk> chunk1List = buildChunks(sent1Pairs);
	List<ParseTreeChunk> chunk2List = buildChunks(sent2Pairs);

	List<List<ParseTreeChunk>> sent1GrpLst = groupChunksAsParses(chunk1List);
	List<List<ParseTreeChunk>> sent2GrpLst = groupChunksAsParses(chunk2List);

	System.out.println("=== Grouped chunks 1 " + sent1GrpLst);
	System.out.println("=== Grouped chunks 2 " + sent2GrpLst);

	return matchTwoSentencesGroupedChunks(sent1GrpLst, sent2GrpLst);
	}

	// groups noun phrases, verb phrases, propos phrases etc. for separate match

	public List<List<ParseTreeChunk>> groupChunksAsParses(
	List<ParseTreeChunk> parseResults) {
	List<ParseTreeChunk> np = new ArrayList<ParseTreeChunk>(), vp = new ArrayList<ParseTreeChunk>(), prp = new ArrayList<ParseTreeChunk>(), sbarp = new ArrayList<ParseTreeChunk>(), pp = new ArrayList<ParseTreeChunk>(), adjp = new ArrayList<ParseTreeChunk>(), whadvp = new ArrayList<ParseTreeChunk>(), restOfPhrasesTypes = new ArrayList<ParseTreeChunk>();
	List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
	for (ParseTreeChunk ch : parseResults) {
	String mainPos = ch.getMainPOS().toLowerCase();

	if (mainPos.equals("s")) {
	continue;
	}
	if (mainPos.equals("np")) {
	np.add(ch);
	} else if (mainPos.equals("vp")) {
	vp.add(ch);
	} else if (mainPos.equals("prp")) {
	prp.add(ch);
	} else if (mainPos.equals("pp")) {
	pp.add(ch);
	} else if (mainPos.equals("adjp")) {
	adjp.add(ch);
	} else if (mainPos.equals("whadvp")) {
	whadvp.add(ch);
	} else if (mainPos.equals("sbar")) {
	sbarp.add(ch);
	} else {
	restOfPhrasesTypes.add(ch);
	}

	}
	results.add(np);
	results.add(vp);
	results.add(prp);
	results.add(pp);
	results.add(adjp);
	results.add(whadvp);
	results.add(restOfPhrasesTypes);

	return results;

	}

	// main function to generalize two expressions grouped by phrase types
	// returns a list of generalizations for each phrase type with filtered
	// sub-expressions
	public List<List<ParseTreeChunk>> matchTwoSentencesGroupedChunks(
	List<List<ParseTreeChunk>> sent1, List<List<ParseTreeChunk>> sent2) {
	List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
	// first irerate through component
	for (int comp = 0; comp < 2 && // just np & vp
	comp < sent1.size() && comp < sent2.size(); comp++) {
	List<ParseTreeChunk> resultComps = new ArrayList<ParseTreeChunk>();
	// then iterate through each phrase in each component
	for (ParseTreeChunk ch1 : sent1.get(comp)) {
	for (ParseTreeChunk ch2 : sent2.get(comp)) { // simpler version
	ParseTreeChunk chunkToAdd = parseTreeMatcher
	.generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(
	ch1, ch2);

	if (!lemmaFormManager.mustOccurVerifier(ch1, ch2, chunkToAdd)) {
	continue; // if the words which have to stay do not stay, proceed to
	// other elements
	}
	Boolean alreadyThere = false;
	for (ParseTreeChunk chunk : resultComps) {
	if (chunk.equalsTo(chunkToAdd)) {
	alreadyThere = true;
	break;
	}

	if (parseTreeMatcher
	.generalizeTwoGroupedPhrasesRandomSelectHighestScore(chunk,
	chunkToAdd).equalsTo(chunkToAdd)) {
	alreadyThere = true;
	break;
	}
	}

	if (!alreadyThere) {
	resultComps.add(chunkToAdd);
	}

	List<ParseTreeChunk> resultCompsReduced = generalizationListReducer
	.applyFilteringBySubsumption(resultComps);
	// if (resultCompsReduced.size() != resultComps.size())
	// System.out.println("reduction of gen list occurred");
	}
	}
	results.add(resultComps);
	}

	return results;
	}

	/* public Boolean equals(ParseTreeChunk ch) {
	List<String> lems = ch.getLemmas();
	List<String> poss = ch.POSs;

	if (this.lemmas.size() <= lems.size())
	return false; // sub-chunk should be shorter than chunk

	for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) {
	if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(
	poss.get(i))))
	return false;
	}
	return true;
	}
	*/
	// 'this' is super - chunk of ch, ch is sub-chunk of 'this'
	public Boolean isASubChunk_OLD(ParseTreeChunk ch) {
	List<String> lems = ch.getLemmas();
	List<String> poss = ch.POSs;

	if (this.lemmas.size() < lems.size())
	return false; // sub-chunk should be shorter than chunk

	for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) {
	if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(
	poss.get(i))))
	return false;
	}
	return true;
	}

	// this => value ch => *
	public Boolean isASubChunk(ParseTreeChunk ch) {
	List<String> lems = ch.getLemmas();
	List<String> poss = ch.POSs;

	if (this.lemmas.size() < lems.size())
	return false; // sub-chunk should be shorter than chunk

	Boolean notSubChunkWithGivenAlignment = false, unComparable = false;

	for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) {
	// both lemma and pos are different
	if (!this.POSs.get(i).equals(poss.get(i)) && !this.lemmas.get(i).equals(lems.get(i)) ){
	unComparable = true;
	break;
	}

	// this => * ch=> run
	if (!this.lemmas.get(i).equals(lems.get(i)) && this.lemmas.get(i).equals("*"))
	notSubChunkWithGivenAlignment = true;
	}
	if (!notSubChunkWithGivenAlignment && !unComparable)
	return true;

	List<String> thisPOS = new ArrayList<String> ( this.POSs);
	Collections.reverse(thisPOS);
	List<String> chPOS = new ArrayList<String> ( poss);
	Collections.reverse(chPOS);
	List<String> thisLemma = new ArrayList<String> ( this.lemmas);
	Collections.reverse(thisLemma );
	List<String> chLemma = new ArrayList<String> ( lems);
	Collections.reverse(chLemma);

	notSubChunkWithGivenAlignment = false; unComparable = false;
	for (int i = lems.size()-1 ; i>=0; i--) {
	// both lemma and pos are different
	if (!thisPOS.get(i).equals(chPOS.get(i)) && !thisLemma.get(i).equals(chLemma.get(i)) ){
	unComparable = true;
	break;
	}

	// this => * ch=> run
	if (!thisLemma.get(i).equals(chLemma.get(i)) && thisLemma.get(i).equals("*"))
	notSubChunkWithGivenAlignment = true;
	}

	if (!notSubChunkWithGivenAlignment && !unComparable)
	return true;
	else
	return false; // then ch is redundant and needs to be removed
	}

	public Boolean equalsTo(ParseTreeChunk ch) {
	List<String> lems = ch.getLemmas();
	List<String> poss = ch.POSs;
	if (this.lemmas.size() != lems.size() \|\| this.POSs.size() != poss.size())
	return false;

	for (int i = 0; i < lems.size(); i++) {
	if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(
	poss.get(i))))
	return false;
	}

	return true;
	}

	public boolean equals(ParseTreeChunk ch) {
	List<String> lems = ch.getLemmas();
	List<String> poss = ch.POSs;
	return ListUtils.isEqualList(ch.getLemmas(), this.lemmas) && ListUtils.isEqualList(ch.getPOSs(), this.POSs);
	}

	public String toString() {
	String buf = " [";
	if (mainPOS != null)
	buf = mainPOS + " [";
	for (int i = 0; i < lemmas.size() && i < POSs.size() ; i++) {
	buf += POSs.get(i) + "-" + lemmas.get(i) + " ";
	if (this.parseTreeNodes!=null){
	Map<String, Object> attrs = this.parseTreeNodes.get(i).getAttributes();
	if (attrs!=null && attrs.keySet().size()>0){
	buf += attrs+ " ";
	}
	String ner =this.parseTreeNodes.get(i).getNe();
	if (ner!=null && ner.length()>1)
	buf+="("+ner+ ") ";
	}
	}
	return buf + "]";
	}

	public String toWordOnlyString(){
	String buf = "";

	for (int i = 0; i < lemmas.size() ; i++) {
	buf+=lemmas.get(i)+" ";
	}
	return buf.trim();
	}

	public int compareTo(ParseTreeChunk o) {
	if (this.size > o.size)
	return -1;
	else
	return 1;

	}

	public String listToString(List<List<ParseTreeChunk>> chunks) {
	StringBuffer buf = new StringBuffer();
	if (chunks.get(0).size() > 0) {
	buf.append(" np " + chunks.get(0).toString());
	}
	if (chunks.get(1).size() > 0) {
	buf.append(" vp " + chunks.get(1).toString());
	}
	if (chunks.size() < 3) {
	return buf.toString();
	}
	if (chunks.get(2).size() > 0) {
	buf.append(" prp " + chunks.get(2).toString());
	}
	if (chunks.get(3).size() > 0) {
	buf.append(" pp " + chunks.get(3).toString());
	}
	if (chunks.get(4).size() > 0) {
	buf.append(" adjp " + chunks.get(4).toString());
	}
	if (chunks.get(5).size() > 0) {
	buf.append(" whadvp " + chunks.get(5).toString());
	}
	/*
	* if (mainPos.equals("np")) np.add(ch); else if (mainPos.equals( "vp"))
	* vp.add(ch); else if (mainPos.equals( "prp")) prp.add(ch); else if
	* (mainPos.equals( "pp")) pp.add(ch); else if (mainPos.equals( "adjp"))
	* adjp.add(ch); else if (mainPos.equals( "whadvp")) whadvp.add(ch);
	*/
	return buf.toString();
	}

	public List<List<ParseTreeChunk>> obtainParseTreeChunkListByParsingList(
	String toParse) {
	List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
	// if (toParse.endsWith("]]]")){
	// toParse = toParse.replace("[[","").replace("]]","");
	// }
	toParse = toParse.replace(" ]], [ [", "&");
	String[] phraseTypeFragments = toParse.trim().split("&");
	for (String toParseFragm : phraseTypeFragments) {
	toParseFragm = toParseFragm.replace("], [", "#");

	List<ParseTreeChunk> resultsPhraseType = new ArrayList<ParseTreeChunk>();
	String[] indivChunks = toParseFragm.trim().split("#");
	for (String expr : indivChunks) {
	List<String> lems = new ArrayList<String>(), poss = new ArrayList<String>();
	expr = expr.replace("[", "").replace(" ]", "");
	String[] pairs = expr.trim().split(" ");
	for (String word : pairs) {
	word = word.replace("]]", "").replace("]", "");
	String[] pos_lem = word.split("-");
	lems.add(pos_lem[1].trim());
	poss.add(pos_lem[0].trim());
	}
	ParseTreeChunk ch = new ParseTreeChunk();
	ch.setLemmas(lems);
	ch.setPOSs(poss);
	resultsPhraseType.add(ch);
	}
	results.add(resultsPhraseType);
	}
	System.out.println(results);
	return results;

	// 2.1 \| Vietnam <b>embassy</b> <b>in</b> <b>Israel</b>: information on how
	// to get your <b>visa</b> at Vietnam
	// <b>embassy</b> <b>in</b> <b>Israel</b>. <b>...</b> <b>Spain</b>.
	// Scotland. Sweden. Slovakia. Switzerland. T
	// [Top of Page] <b>...</b>
	// [[ [NN-* IN-in NP-israel ], [NP-* IN-in NP-israel ], [NP-* IN-* TO-* NN-*
	// ], [NN-visa IN-* NN-* IN-in ]], [
	// [VB-get NN-visa IN-* NN-* IN-in .-* ], [VBD-* IN-* NN-* NN-* .-* ], [VB-*
	// NP-* ]]]

	}

	public void setMainPOS(String mainPOS) {
	this.mainPOS = mainPOS;
	}

	public String getMainPOS() {
	return mainPOS;
	}

	public List<String> getLemmas() {
	return lemmas;
	}

	public void setLemmas(List<String> lemmas) {
	this.lemmas = lemmas;
	}

	public List<String> getPOSs() {
	return POSs;
	}

	public void setPOSs(List<String> pOSs) {
	POSs = pOSs;
	}

	public ParseTreeMatcher getParseTreeMatcher() {
	return parseTreeMatcher;
	}

	public static void main(String[] args){
	String phrStr = "[<1>NP'Property':NN, <2>NP'has':VBZ, <3>NP'lots':NNS, <4>NP'of':IN, <5>NP'trash':NN, <6>NP'and':CC, <7>NP'debris':NN]";
	ParseTreeChunk ch = new ParseTreeChunk(phrStr);
	System.out.println(ch);
	}
	}