/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parse_thicket.opinion_processor;
import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.lang3.StringUtils;
import opennlp.tools.jsmlearning.ProfileReaderWriter;
import opennlp.tools.similarity.apps.BingQueryRunner;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.utils.ValueSortMap;
import opennlp.tools.stemmer.PStemmer;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.TextProcessor;
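/**
 * Groups linguistic phrases (noun-phrase chunks extracted from opinion text) by head noun,
 * derives standardized topic names from each group, and writes the results as CSV reports.
 * The synonym vocabulary and stop-word resources are loaded from the project's resource directories.
 */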
public class LinguisticPhraseManager {
private Map<String, Integer> freq = new ConcurrentHashMap<String, Integer>();
// this static object is initialized here to point StopList at the stop-word resources
private static StopList stop = StopList.getInstance(new File("src/test/resources").getAbsolutePath() + "/");
// this list will be overwritten by the external synonyms.csv
private static String[][] synonymPairs = new String[][]{};
private PStemmer stemmer = new PStemmer();
private List<ParseTreeChunk> lingPhrases = new ArrayList<ParseTreeChunk>();
private List<String> standardizedTopics = new ArrayList<String>();
// maps each linguistic phrase to the list of linguistic phrases that share its head noun
private Map<ParseTreeChunk, List<ParseTreeChunk>> entry_group = new ConcurrentHashMap<ParseTreeChunk, List<ParseTreeChunk>>();
// maps each standardized string phrase to the list of linguistic phrases that share its head noun
private Map<String, List<ParseTreeChunk>> std_group = new ConcurrentHashMap<String, List<ParseTreeChunk>>();
private BingQueryRunner runner = new BingQueryRunner();
private static final int MIN_NUMBER_OF_PHRASES_TO_CONSIDER = 3;
private static final int MIN_LENGTH_OF_WORD_TO_CONSIDER = 3;
private String resourceDir;
public LinguisticPhraseManager(){
try {
resourceDir = new File(".").getCanonicalPath()+"/src/main/resources/";
List<String[]> vocabs = ProfileReaderWriter.readProfiles(resourceDir+"synonyms.csv");
synonymPairs = new String[vocabs.size()][2];
int count = 0;
for(String[] line: vocabs){
// skip malformed vocabulary rows explicitly instead of relying on a catch-all exception handler
if (line!=null && line.length>=2){
synonymPairs[count] = line;
count++;
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
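// takes one line of the phrase-extraction log (a chain of parse-tree nodes) and builds a ParseTreeChunk;
// the phrase is expected to be an NP or a VP, otherwise parsing fails and an empty chunk results.
// Judging from the parsing below, an input line looks roughly like (an assumption, not a confirmed format):
// [<1>NP'the':DT, <2>NP'kitchen':NN, <3>NP'floor':NN]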
private ParseTreeChunk parseLingPhraseIntoParseTreeChunk(String phrStr){
ParseTreeChunk ch = new ParseTreeChunk();
List<String> POSs = new ArrayList<String>(), lemmas = new ArrayList<String>();
ch.setPOSs(POSs);
ch.setLemmas(lemmas);
String[] parts = phrStr.replace("]","").split(", <");
ch.setMainPOS(StringUtils.substringBetween(phrStr, ">", "'"));
try {
for(String part: parts){
String lemma = StringUtils.substringBetween(part, "P'", "':");
int colonPos = part.indexOf(':');
// guard against malformed parts: the original lowercased the lemma before the null check,
// which caused a NullPointerException
if (lemma==null || colonPos<0)
continue;
String pos = part.substring(colonPos+1);
POSs.add(pos.trim());
lemmas.add(lemma.toLowerCase().trim());
}
} catch (Exception e) {
// exceptions are expected if the extracted phrase is NEITHER an NP nor a VP;
// the (possibly empty) chunk will not create a new topic
e.printStackTrace();
}
return ch;
}
// constructor taking an array of extraction files, optimized for performance:
// only topics occurring at least MIN_NUMBER_OF_PHRASES_TO_CONSIDER times are turned into linguistic phrases
public LinguisticPhraseManager(String[] loadPaths){
List<String[]> rows = new ArrayList<String[]>();
for(String file: loadPaths){
rows.addAll(ProfileReaderWriter.readProfiles( file));
}
for(String[] l: rows){
if (l.length<3 || l[1]==null || l[2]==null)
continue;
String word = l[1].toLowerCase().trim();
if (word.indexOf("=>")>-1)
continue;
word = isAcceptableStringPhrase(word);
if (word==null)
continue;
if (!freq.containsKey(word)) {
freq.put(word, 1);
} else {
freq.put(word, freq.get(word) + 1);
// once the occurrence count reaches the threshold, create the topic
if (freq.get(word)==MIN_NUMBER_OF_PHRASES_TO_CONSIDER){
ParseTreeChunk ch = parseLingPhraseIntoParseTreeChunk(l[2]);
ch = isAcceptableLingPhrase(ch);
if (ch==null)
continue;
lingPhrases.add(ch);
}
}
}
// the frequency data is no longer needed
freq.clear();
}
// constructor taking a single topic extraction file; not optimized for performance
public LinguisticPhraseManager(String loadPath){
List<String[]> rows = ProfileReaderWriter.readProfiles( loadPath);
for(String[] l: rows){
if (l.length<3 || l[1]==null || l[2]==null)
continue;
String word = l[1].toLowerCase().trim();
if (word.indexOf("=>")>-1)
continue;
word = isAcceptableStringPhrase(word);
if (word==null)
continue;
if (!freq.containsKey(word)) {
ParseTreeChunk ch = parseLingPhraseIntoParseTreeChunk(l[2]);
ch = isAcceptableLingPhrase(ch);
if (ch==null)
continue;
freq.put(word, 1);
lingPhrases.add(ch);
} else {
freq.put(word, freq.get(word) + 1);
}
}
freq = ValueSortMap.sortMapByValue(freq, false);
}
// rejects phrases starting with 'to' and strips a leading article and trailing punctuation,
// in case this was not done at the phrase-forming stage; returns null for rejected phrases
private String isAcceptableStringPhrase(String word) {
if (word.startsWith("to "))
return null;
if (word.startsWith("a "))
return word.substring(2);
if (word.endsWith(" !") || word.endsWith(" ."))
return word.substring(0, word.length()-2).trim();
return word;
}
// we only accept noun phrases (NP); null-safe, since getMainPOS() may be null for malformed phrases
private ParseTreeChunk isAcceptableLingPhrase(ParseTreeChunk ch) {
if (!"NP".equals(ch.getMainPOS()))
return null;
return ch;
}
// groups are sets of phrases that share the same head noun.
// puts each phrase in a group and maps the group's longer phrase to the list of its members
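// for example (hypothetical data), 'nice kitchen floor' would become the key for a group
// containing 'kitchen floor', since both end with the head noun 'floor'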
public void doLingGrouping(){
for(int i=0; i< lingPhrases.size(); i++){
for(int j=i+1; j< lingPhrases.size(); j++){
ParseTreeChunk chI = lingPhrases.get(i);
ParseTreeChunk chJ = lingPhrases.get(j);
if (chI.getLemmas().get(chI.getLemmas().size()-1).equals(chJ.getLemmas().get(chJ.getLemmas().size()-1))
&& chI.getPOSs().get(chI.getLemmas().size()-1).startsWith("NN") ){
List<ParseTreeChunk> values = null;
if( chI.getLemmas().size()<chJ.getLemmas().size()){
// the longer phrase chJ is the group key; fetch its existing group so it is not overwritten
values = entry_group.get(chJ);
if (values == null)
values = new ArrayList<ParseTreeChunk>();
values.add(chI);
entry_group.put(chJ, values);
} else {
values = entry_group.get(chI);
if (values == null)
values = new ArrayList<ParseTreeChunk>();
values.add(chJ);
entry_group.put(chI, values);
}
}
}
}
}
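// for each head-noun group, intersects the lemma lists of every pair of member phrases;
// an intersection of two or more content words (NN/JJ/VB, with synonyms normalized) becomes a standardized topic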
public List<String> formStandardizedTopic(){
Set<ParseTreeChunk> keys = entry_group.keySet();
for(ParseTreeChunk k: keys){
List<ParseTreeChunk> group = entry_group.get(k);
for(int i=0; i< group.size(); i++)
for(int j=i+1; j< group.size(); j++){
ParseTreeChunk chI = group.get(i);
ParseTreeChunk chJ = group.get(j);
List<String> lemmas = new ArrayList<String>(chI.getLemmas());
lemmas.retainAll(chJ.getLemmas());
if (lemmas.size()<2)
continue;
String buf = ""; List<String> candTopicLst = new ArrayList<String>();
for(String w: lemmas){
if (w.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER)
continue;
if (!StringUtils.isAlpha(w))
continue;
// find POS of w
boolean bAccept = false;
for(int iw=0; iw<chI.getLemmas().size(); iw++){
if (w.equals(chI.getLemmas().get(iw))){
if (chI.getPOSs().get(iw).startsWith("NN") || chI.getPOSs().get(iw).startsWith("JJ")
|| chI.getPOSs().get(iw).startsWith("VB"))
bAccept=true;
}
}
if (bAccept){
//buf+=w+" ";
String ws = substituteSynonym(w);
candTopicLst.add(ws);
}
}
// remove duplicates like 'new new house'
//candTopicLst = new ArrayList<String>(new HashSet<String>(candTopicLst));
for(String w: candTopicLst){
buf+=w+" ";
}
buf = buf.trim();
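// reject single-word topics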
if (buf.indexOf(' ')<0)
continue;
if (!standardizedTopics.contains(buf)){
standardizedTopics.add(buf);
std_group.put(buf, group);
}
}
}
cleanUpStandardizedTopics();
return standardizedTopics;
}
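// merges standardized topics that contain the same set of words after stemming,
// keeping the shorter surface form and moving the group members to it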
public void cleanUpStandardizedTopics(){
List<String> toDelete = new ArrayList<String>();
for(int i=0; i< standardizedTopics.size(); i++)
for(int j=i+1; j< standardizedTopics.size(); j++){
List<String> t1 = TextProcessor.fastTokenize(standardizedTopics.get(i), false);
List<String> t2 = TextProcessor.fastTokenize(standardizedTopics.get(j), false);
for(int k=0; k< t1.size(); k++){
t1.set(k, stemmer.stem(t1.get(k)));
}
for(int k=0; k< t2.size(); k++){
t2.set(k, stemmer.stem(t2.get(k)));
}
// check if lists are equal
if (t1.size()!=t2.size())
continue;
// if, once all keywords are stemmed, one phrase annihilates the other, the two topics are duplicates
t1.removeAll(t2);
if (t1.isEmpty()){
if (standardizedTopics.get(i).length()> standardizedTopics.get(j).length()){
toDelete.add(standardizedTopics.get(i));
// TODO update std_group entry
System.out.println("Removing '" + standardizedTopics.get(i) + "' because of '" + standardizedTopics.get(j) );
List<ParseTreeChunk> stJ = std_group.get(standardizedTopics.get(j));
stJ.addAll(std_group.get(standardizedTopics.get(i)));
stJ = new ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stJ));
std_group.put(standardizedTopics.get(j), stJ);
}
else {
toDelete.add(standardizedTopics.get(j));
System.out.println("Removing '" + standardizedTopics.get(j) + "' because of '" + standardizedTopics.get(i) );
List<ParseTreeChunk> stI = std_group.get(standardizedTopics.get(i));
stI.addAll(std_group.get(standardizedTopics.get(j)));
stI = new ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stI));
std_group.put(standardizedTopics.get(i), stI);
}
}
}
for(String d: toDelete){
//System.out.println("Removed '" + d + "'");
standardizedTopics.remove(d);
}
}
// substitutes a word by its synonym according to the internal vocabulary (synonyms.csv)
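// e.g. with a hypothetical vocabulary row {"flooring", "floor"}, substituteSynonym("flooring") returns "floor"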
private String substituteSynonym(String w) {
for(String[] pair: synonymPairs){
// skip malformed rows explicitly instead of relying on a catch-all exception handler
if (pair!=null && pair.length>=2 && w.equals(pair[0]))
return pair[1];
}
return w;
}
public void generateGroupingReport(String reportName){
List<String[]> report = new ArrayList<String[]>();
Set<ParseTreeChunk> chs = entry_group.keySet();
report.add(new String[]{"string phrase" , "class", "linguistic phrase", "list of ling phrases class representatives"});
for(ParseTreeChunk ch: chs){
String head = ch.getLemmas().get(ch.getLemmas().size()-1);
List<ParseTreeChunk> values = entry_group.get(ch);
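// only groups with at least 6 members get a head-noun class label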
if (values.size()<6)
head = "";
report.add(new String[]{ch.toWordOnlyString(), head, ch.toString(), values.toString()});
}
ProfileReaderWriter.writeReport(report, reportName);
}
// final merge of head-noun variants such as floor-floors-flooring, with phrase update
public void applyLastRoundOfAggregation(){
//merge <floor - floors - flooring>
/*
List<ParseTreeChunk> entries = new ArrayList<ParseTreeChunk>(entry_group.keySet());
for(int i=0; i< entries.size(); i++){
for(int j=i+1; j< entries.size(); j++){
ParseTreeChunk chI = entries.get(i);
ParseTreeChunk chJ = entries.get(j);
String headI = getLastElement(chI.getLemmas());
String headJ = getLastElement(chJ.getLemmas());
if (headI==null || headI.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER ||
headJ==null || headJ.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER )
continue;
if (headI.indexOf(headJ)>-1){
//leave headJ
List<ParseTreeChunk> valuesToAddTo = entry_group.get(chJ);
List<ParseTreeChunk> valuesBeingAdded = entry_group.get(chI);
if (valuesToAddTo==null || valuesBeingAdded == null)
continue;
valuesToAddTo.addAll(valuesBeingAdded);
entry_group.put(chJ, valuesToAddTo);
entry_group.remove(chI);
System.out.println("Deleting entry '"+ headI +"' and moving group to entry '"+ headJ +"'");
} else if (headJ.indexOf(headI)>-1){
//leave headJ
List<ParseTreeChunk> valuesToAddTo = entry_group.get(chI);
List<ParseTreeChunk> valuesBeingAdded = entry_group.get(chJ);
if (valuesToAddTo==null || valuesBeingAdded == null)
continue;
valuesToAddTo.addAll(valuesBeingAdded);
entry_group.put(chI, valuesToAddTo);
entry_group.remove(chJ);
System.out.println("Deleting entry '"+ headJ +"' and moving group to entry '"+ headI +"'");
}
}
}
*/
for(int i = 0; i<standardizedTopics.size(); i++ )
for(int j = i+1; j<standardizedTopics.size(); j++ ){
String headI = extractHeadNounFromPhrase(standardizedTopics.get(i));
String headJ = extractHeadNounFromPhrase(standardizedTopics.get(j));
// if the same word do nothing
if (headI.equals(headJ))
continue;
// merge only if one head is a sub-word of the other
if (headI.indexOf(headJ)>-1){
if (!properSubWordForm(headI, headJ))
continue;
//entry 'I' will be updated
String newKey = standardizedTopics.get(i).replace(headI, headJ);
List<ParseTreeChunk> stI = std_group.get(standardizedTopics.get(i));
List<ParseTreeChunk> stInew = std_group.get(newKey);
//if (stInew!=null && !stInew.isEmpty())
// stI.addAll(stInew);
if(stI==null)
continue;
std_group.put(newKey, stI);
std_group.remove(standardizedTopics.get(i));
System.out.println("Deleted entry for key '"+ standardizedTopics.get(i) +"' and created '"+ newKey +"'");
standardizedTopics.set(i, newKey);
} else if (headJ.indexOf(headI)>-1){
if (!properSubWordForm(headJ, headI))
continue;
//entry 'J' will be updated
String newKey = standardizedTopics.get(j).replace(headJ, headI);
List<ParseTreeChunk> stJ = std_group.get(standardizedTopics.get(j));
List<ParseTreeChunk> stJnew = std_group.get(newKey);
//if (stJnew!=null && !stJnew.isEmpty())
// stJ.addAll(stJnew);
if(stJ==null)
continue;
std_group.put(newKey, stJ);
std_group.remove(standardizedTopics.get(j));
System.out.println("Deleted entry for key '"+ standardizedTopics.get(j) +"' and created '"+ newKey +"'");
standardizedTopics.set(j, newKey);
}
}
}
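// accepts a reduction of headI to headJ only when the leftover suffix is a plural/gerund ending
// or one of a few domain-specific compound parts (e.g. 'bathroom' reduces to 'bath')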
private boolean properSubWordForm(String headI, String headJ) {
String suffix = headI.replace(headJ, "");
if (suffix.equals("s") || suffix.equals("ing") //|| suffix.equals("er")
|| suffix.equals("rooms") ||
suffix.equals("") || suffix.equals("counter") ||
suffix.equals("room") || suffix.equals("back"))
return true;
//System.out.println("Wrong word '"+ headI + "'reduction into '" + headJ +"'");
return false;
}
// generates a CSV report with columns: category (head noun), topic, covering sub-topics, and phrase instances
public void generateStdTopicReport(String reportName){
List<String[]> report = new ArrayList<String[]>();
report.add(new String[]{"category", "topic", "sub-topics", "phrase instances" });
for(String t: standardizedTopics){
String bufCover = "";
int count = 0;
List<ParseTreeChunk> ptcList = std_group.get(t);
if (ptcList == null)
continue;
for(ParseTreeChunk ch: ptcList){
List<String> candidate = TextProcessor.fastTokenize(ch.toWordOnlyString(), false);
List<String> tList = TextProcessor.fastTokenize(t, false);
List<String> tListChk = new ArrayList<String>(tList);
tListChk.removeAll(candidate);
// skip instances that do not fully cover the topic, or that are identical to it
if (!tListChk.isEmpty() || ch.toWordOnlyString().equals(t)){
continue;
}
boolean bCovered = true;
for(String ts: tList){
boolean bCandWordsIsCovered = false;
for(String s: candidate){
if ((s.indexOf(ts)>-1) )// && properSubWordForm(s, ts))
bCandWordsIsCovered = true;
}
if (!bCandWordsIsCovered){
bCovered = false;
break;
}
}
if (!bCovered)
continue;
bufCover+=ch.toWordOnlyString()+ " # ";
count++;
if (count >= 40)
break;
}
if (bufCover.endsWith(" # "))
bufCover = bufCover.substring(0, bufCover.length()-3).trim();
String buf = "";
count = 0;
// list at most 40 phrase instances per first-level topic
for(ParseTreeChunk ch: ptcList){
buf+=ch.toWordOnlyString()+ "|";
count++;
if (count >= 40)
break;
}
//TODO uncomment
//t = spell.getSpellCheckResult(t);
report.add(new String[]{extractHeadNounFromPhrase(t), t, bufCover, buf //, std_group.get(t).toString()
});
}
ProfileReaderWriter.writeReport(report, reportName);
}
// gets the last word of a phrase (assumed to be the head noun)
private String extractHeadNounFromPhrase(String topic){
String[] tops = topic.split(" ");
int len = tops.length;
if (len>1){
return tops[len-1];
}
else return topic;
}
// gets the last element of a list
private String getLastElement(List<String> arrayList ){
if (arrayList != null && !arrayList.isEmpty()) {
return arrayList.get(arrayList.size()-1);
}
return null;
}
/*
* Uses the Bing API to check whether an extracted phrase can be found on the web and is therefore a meaningful phrase
*/
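// note: the multi-file constructor clears freq after loading, so this check is only meaningful
// for an instance built with the single-file constructor, which keeps the frequency map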
public List<String> verifyTopic(){
Set<String> phrases = freq.keySet();
List<String> approvedPhrases = new ArrayList<String>();
for(String p: phrases){
List<HitBase> hits = runner.runSearch("\""+p+"\"");
for(HitBase h: hits){
// phrases in freq are lowercased, so normalize the search-hit text before matching
String lookup = (h.getTitle() + " " + h.getAbstractText()).toLowerCase();
if (lookup.indexOf(p)>-1){
approvedPhrases.add(p);
break;
}
}
}
return approvedPhrases;
}
public Set<String> getPhraseLookup(){
return freq.keySet();
}
// using phrase frequency to filter phrases
public boolean isAcceptablePhrase(String phrase){
Integer count = freq.get(phrase.toLowerCase().trim());
if (count==null)
return false;
// phrases occurring extremely often are likely noise
return count > 0 && count < 10000;
}
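// example end-to-end run; the input CSV path below is environment-specific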
public static void main(String[] args){
LinguisticPhraseManager man = new LinguisticPhraseManager(
"/Users/bgalitsky/Documents/workspace/move_com/phrasesOfInterest.csv");
man.doLingGrouping();
man.generateGroupingReport("topics_groups7_mergedHeads.csv");
List<String> stdTopics = man.formStandardizedTopic();
man.applyLastRoundOfAggregation();
man.generateStdTopicReport("std_topics7_mergedHeads.csv");
System.out.println(stdTopics);
}
}