| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package opennlp.tools.parse_thicket.opinion_processor; |
| |
| import java.io.File; |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.concurrent.ConcurrentHashMap; |
| import org.apache.commons.lang3.StringUtils; |
| import opennlp.tools.jsmlearning.ProfileReaderWriter; |
| import opennlp.tools.similarity.apps.BingQueryRunner; |
| import opennlp.tools.similarity.apps.HitBase; |
| import opennlp.tools.similarity.apps.utils.ValueSortMap; |
| import opennlp.tools.stemmer.PStemmer; |
| import opennlp.tools.textsimilarity.ParseTreeChunk; |
| import opennlp.tools.textsimilarity.TextProcessor; |
| |
| public class LinguisticPhraseManager { |
| private Map<String, Integer> freq = new ConcurrentHashMap<String, Integer>(); |
| |
| // the purpose to init this static object is to show the path to resources |
| private static StopList stop = StopList.getInstance(new File(".").getAbsolutePath().replace(".","")+ "src/test/resources/"); |
| |
| // this list will be overwritten by the external synonyms.csv |
| private static String[][] synonymPairs = new String[][]{}; |
| private PStemmer stemmer = new PStemmer(); |
| |
| private List<ParseTreeChunk> lingPhrases = new ArrayList<ParseTreeChunk>(); |
| private List<String> standardizedTopics = new ArrayList<String>(); |
| // map which shows for each ling phrase the list of ling phrases with the same head noun it belongs |
| private Map<ParseTreeChunk, List<ParseTreeChunk>> entry_group = new ConcurrentHashMap<ParseTreeChunk, List<ParseTreeChunk>>(); |
| |
| // map which shows for each string phrase the list of ling phrases with the same head noun it belongs |
| private Map<String, List<ParseTreeChunk>> std_group = new ConcurrentHashMap<String, List<ParseTreeChunk>>(); |
| |
| private BingQueryRunner runner = new BingQueryRunner(); |
| private static final int MIN_NUMBER_OF_PHRASES_TO_CONSIDER = 3;//2; 5 |
| private static final int MIN_LENGTH_OF_WORD_TO_CONSIDER = 3; |
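// NOTE: this description presumably belongs to parseLingPhraseIntoParseTreeChunk() rather than to the field below.
// That function takes the logged form of a chain of parse tree nodes and rebuilds a chunk instance from it.
// The phrases should only be VP or NP; any other phrase type is not expected to yield a usable chunk.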
| |
| |
| |
| private String resourceDir; |
| public LinguisticPhraseManager(){ |
| try { |
| resourceDir = new File( "." ).getCanonicalPath()+"/src/main/resources/"; |
| List<String[]> vocabs = ProfileReaderWriter.readProfiles(resourceDir+"/synonyms.csv"); |
| synonymPairs = new String[vocabs.size()][2]; |
| int count = 0; |
| for(String[] line: vocabs){ |
| try { |
| synonymPairs[count] = line; |
| count++; |
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
| |
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
| |
| private ParseTreeChunk parseLingPhraseIntoParseTreeChunk(String phrStr){ |
| ParseTreeChunk ch = new ParseTreeChunk(); |
| List<String> POSs = new ArrayList<String>(), lemmas = new ArrayList<String>(); |
| |
| String[] parts = phrStr.replace("]","").split(", <"); |
| |
| ch.setMainPOS( StringUtils.substringBetween(phrStr, ">", "'")); |
| try { |
| for(String part: parts){ |
| String lemma = StringUtils.substringBetween(part, "P'", "':").toLowerCase(); |
| String pos = part.substring(part.indexOf(":")+1, part.length()); |
| |
| if (pos==null || lemma ==null){ |
| continue; |
| } |
| POSs.add(pos.trim()); |
| lemmas.add(lemma.trim()); |
| ch.setPOSs(POSs); ch.setLemmas(lemmas); |
| } |
| } catch (Exception e) { |
| // we expect exceptions if extracted phrases are NEITHER NP nor VP |
| // empty chunk will be given which will not create a new topic |
| e.printStackTrace(); |
| } |
| |
| return ch; |
| } |
| |
/**
 * Builds the manager from several extraction files; optimized for performance.
 *
 * Column 1 of each row is the string phrase and column 2 its logged linguistic form.
 * A topic is created for a phrase only at the moment its occurrence count reaches
 * MIN_NUMBER_OF_PHRASES_TO_CONSIDER. The frequency data is discarded at the end.
 *
 * @param loadPaths paths of the extraction files to read
 */
public LinguisticPhraseManager(String[] loadPaths){
    List<String[]> rows = new ArrayList<String[]>();
    for (String path : loadPaths) {
        rows.addAll(ProfileReaderWriter.readProfiles(path));
    }

    for (String[] row : rows) {
        boolean usable = row.length >= 3 && row[1] != null && row[2] != null;
        if (!usable) {
            continue;
        }

        String phrase = row[1].toLowerCase().trim();
        if (phrase.indexOf("=>") >= 0) {
            continue;
        }
        phrase = isAcceptableStringPhrase(phrase);
        if (phrase == null) {
            continue;
        }

        Integer seenSoFar = freq.get(phrase);
        if (seenSoFar == null) {
            // first sighting: only start counting
            freq.put(phrase, 1);
            continue;
        }

        int updated = seenSoFar + 1;
        freq.put(phrase, updated);
        // once we reached the count for a topic, create it
        if (updated == MIN_NUMBER_OF_PHRASES_TO_CONSIDER) {
            ParseTreeChunk chunk = isAcceptableLingPhrase(parseLingPhraseIntoParseTreeChunk(row[2]));
            if (chunk != null) {
                lingPhrases.add(chunk);
            }
        }
    }

    // we don't need frequency data any more
    freq.clear();
}
| |
/**
 * Builds the manager from a single topic extraction file; not optimized for performance.
 *
 * The first time a string phrase is seen, its linguistic form (column 2) is parsed and,
 * if accepted, stored; a phrase whose linguistic form is rejected is not counted and is
 * tried again on its next occurrence. Later occurrences only increase the count.
 * Finally the frequency map is replaced by its value-sorted version.
 *
 * @param loadPath path of the extraction file to read
 */
public LinguisticPhraseManager(String loadPath){
    for (String[] row : ProfileReaderWriter.readProfiles(loadPath)) {
        boolean usable = row.length >= 3 && row[1] != null && row[2] != null;
        if (!usable) {
            continue;
        }

        String phrase = row[1].toLowerCase().trim();
        if (phrase.indexOf("=>") >= 0) {
            continue;
        }
        phrase = isAcceptableStringPhrase(phrase);
        if (phrase == null) {
            continue;
        }

        if (freq.containsKey(phrase)) {
            freq.put(phrase, freq.get(phrase) + 1);
            continue;
        }

        ParseTreeChunk chunk = isAcceptableLingPhrase(parseLingPhraseIntoParseTreeChunk(row[2]));
        if (chunk == null) {
            continue;
        }
        freq.put(phrase, 1);
        lingPhrases.add(chunk);
    }

    freq = ValueSortMap.sortMapByValue(freq, false);
}
| // removing prepositions and articles in case it has not worked at phrase forming stage |
| private String isAcceptableStringPhrase(String word) { |
| if (word.startsWith("to ")) |
| return null; |
| if (word.startsWith("a ")) |
| return word.substring(2, word.length()); |
| |
| if (word.endsWith(" !") || word.endsWith(" .")) |
| return word.substring(0, word.length()-2).trim(); |
| |
| return word; |
| } |
| // we only accept NP |
| private ParseTreeChunk isAcceptableLingPhrase(ParseTreeChunk ch) { |
| if (!ch.getMainPOS().equals("NP")) |
| return null; |
| |
| |
| return ch; |
| } |
| |
| // groups are sets of phrases with the same head noun |
| // put all phrases in a group. Have a map from each phrase to its group: the list of members |
| public void doLingGrouping(){ |
| for(int i=0; i< lingPhrases.size(); i++){ |
| for(int j=i+1; j< lingPhrases.size(); j++){ |
| ParseTreeChunk chI = lingPhrases.get(i); |
| ParseTreeChunk chJ = lingPhrases.get(j); |
| if (chI.getLemmas().get(chI.getLemmas().size()-1).equals(chJ.getLemmas().get(chJ.getLemmas().size()-1)) |
| && chI.getPOSs().get(chI.getLemmas().size()-1).startsWith("NN") ){ |
| List<ParseTreeChunk> values = null; |
| if( chI.getLemmas().size()<chJ.getLemmas().size()){ |
| |
| if (values == null) |
| values = new ArrayList<ParseTreeChunk>(); |
| values.add(chI); |
| entry_group.put(chJ, values); |
| } else { |
| values = entry_group.get(chI); |
| if (values == null) |
| values = new ArrayList<ParseTreeChunk>(); |
| values.add(chJ); |
| entry_group.put(chI, values); |
| } |
| } |
| } |
| } |
| |
| |
| } |
| |
| public List<String> formStandardizedTopic(){ |
| Set<ParseTreeChunk> keys = entry_group.keySet(); |
| for(ParseTreeChunk k: keys){ |
| List<ParseTreeChunk> lingPhrases = entry_group.get(k); |
| for(int i=0; i< lingPhrases.size(); i++) |
| for(int j=i+1; j< lingPhrases.size(); j++){ |
| ParseTreeChunk chI = lingPhrases.get(i); |
| ParseTreeChunk chJ = lingPhrases.get(j); |
| List<String> lemmas = new ArrayList<String>(chI.getLemmas()); |
| lemmas.retainAll(chJ.getLemmas()); |
| if (lemmas.size()<2) |
| continue; |
| String buf = ""; List<String> candTopicLst = new ArrayList<String>(); |
| for(String w: lemmas){ |
| if (w.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER) |
| continue; |
| if (!StringUtils.isAlpha(w)) |
| continue; |
| // find POS of w |
| boolean bAccept = false; |
| for(int iw=0; iw<chI.getLemmas().size(); iw++){ |
| if (w.equals(chI.getLemmas().get(iw))){ |
| if (chI.getPOSs().get(iw).startsWith("NN") || chI.getPOSs().get(iw).startsWith("JJ") |
| || chI.getPOSs().get(iw).startsWith("VB")) |
| bAccept=true; |
| } |
| } |
| if (bAccept){ |
| //buf+=w+" "; |
| String ws = substituteSynonym(w); |
| candTopicLst.add(ws); |
| } |
| } |
| // remove duplicates like 'new new house' |
| //candTopicLst = new ArrayList<String>(new HashSet<String>(candTopicLst)); |
| for(String w: candTopicLst){ |
| buf+=w+" "; |
| } |
| |
| buf = buf.trim(); |
| if (buf.indexOf(' ')<0) |
| continue; |
| |
| if (!standardizedTopics.contains(buf)){ |
| standardizedTopics.add(buf); |
| std_group.put(buf, lingPhrases); |
| } |
| } |
| } |
| cleanUpStandardizedTopics(); |
| |
| return standardizedTopics; |
| } |
| |
| public void cleanUpStandardizedTopics(){ |
| List<String> toDelete = new ArrayList<String>(); |
| for(int i=0; i< standardizedTopics.size(); i++) |
| for(int j=i+1; j< standardizedTopics.size(); j++){ |
| List<String> t1 = TextProcessor.fastTokenize(standardizedTopics.get(i), false); |
| List<String> t2 = TextProcessor.fastTokenize(standardizedTopics.get(j), false); |
| for(int k=0; k< t1.size(); k++){ |
| t1.set(k, stemmer.stem(t1.get(k))); |
| } |
| for(int k=0; k< t2.size(); k++){ |
| t2.set(k, stemmer.stem(t2.get(k))); |
| } |
| // check if lists are equal |
| if (t1.size()!=t2.size()) |
| continue; |
| //if in two phrases once all keywords are tokenized, one phrase annihilates another, |
| t1.removeAll(t2); |
| if (t1.isEmpty()){ |
| if (standardizedTopics.get(i).length()> standardizedTopics.get(j).length()){ |
| toDelete.add(standardizedTopics.get(i)); |
| // TODO update std_group entry |
| System.out.println("Removing '" + standardizedTopics.get(i) + "' because of '" + standardizedTopics.get(j) ); |
| List<ParseTreeChunk> stJ = std_group.get(standardizedTopics.get(j)); |
| stJ.addAll(std_group.get(standardizedTopics.get(i))); |
| stJ = new ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stJ)); |
| std_group.put(standardizedTopics.get(j), stJ); |
| } |
| else { |
| toDelete.add(standardizedTopics.get(j)); |
| System.out.println("Removing '" + standardizedTopics.get(j) + "' because of '" + standardizedTopics.get(i) ); |
| List<ParseTreeChunk> stI = std_group.get(standardizedTopics.get(i)); |
| stI.addAll(std_group.get(standardizedTopics.get(j))); |
| stI = new ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stI)); |
| std_group.put(standardizedTopics.get(i), stI); |
| } |
| |
| } |
| } |
| for(String d: toDelete){ |
| //System.out.println("Removed '" + d + "'"); |
| standardizedTopics.remove(d); |
| } |
| } |
| |
| // substitute synonyms according to internal vocab |
| private String substituteSynonym(String w) { |
| try { |
| for(String[] pair: synonymPairs){ |
| if (w.equals(pair[0])) |
| return pair[1]; |
| } |
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| return w; |
| } |
| |
| public void generateGroupingReport(String reportName){ |
| List<String[]> report = new ArrayList<String[]>(); |
| Set<ParseTreeChunk> chs = entry_group.keySet(); |
| report.add(new String[]{"string phrase" , "class", "linguistic phrase", "list of ling phrases class representatives"}); |
| |
| for(ParseTreeChunk ch: chs){ |
| String head = ch.getLemmas().get(ch.getLemmas().size()-1); |
| List<ParseTreeChunk> values = entry_group.get(ch); |
| if (values.size()<6) |
| head = ""; |
| report.add(new String[]{ch.toWordOnlyString(), head, ch.toString(), values.toString()}); |
| } |
| ProfileReaderWriter.writeReport(report, reportName); |
| } |
| |
| //final merge floor-floors-flooring as head nound with phrase update |
| public void applyLastRoundOfAggregation(){ |
| //merge <floor - floors - flooring> |
| /* |
| List<ParseTreeChunk> entries = new ArrayList<ParseTreeChunk>(entry_group.keySet()); |
| for(int i=0; i< entries.size(); i++){ |
| for(int j=i+1; j< entries.size(); j++){ |
| ParseTreeChunk chI = entries.get(i); |
| ParseTreeChunk chJ = entries.get(j); |
| String headI = getLastElement(chI.getLemmas()); |
| String headJ = getLastElement(chJ.getLemmas()); |
| if (headI==null || headI.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER || |
| headJ==null || headJ.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER ) |
| continue; |
| |
| if (headI.indexOf(headJ)>-1){ |
| //leave headJ |
| List<ParseTreeChunk> valuesToAddTo = entry_group.get(chJ); |
| List<ParseTreeChunk> valuesBeingAdded = entry_group.get(chI); |
| if (valuesToAddTo==null || valuesBeingAdded == null) |
| continue; |
| valuesToAddTo.addAll(valuesBeingAdded); |
| entry_group.put(chJ, valuesToAddTo); |
| entry_group.remove(chI); |
| System.out.println("Deleting entry '"+ headI +"' and moving group to entry '"+ headJ +"'"); |
| } else if (headJ.indexOf(headI)>-1){ |
| //leave headJ |
| List<ParseTreeChunk> valuesToAddTo = entry_group.get(chI); |
| List<ParseTreeChunk> valuesBeingAdded = entry_group.get(chJ); |
| if (valuesToAddTo==null || valuesBeingAdded == null) |
| continue; |
| valuesToAddTo.addAll(valuesBeingAdded); |
| entry_group.put(chI, valuesToAddTo); |
| entry_group.remove(chJ); |
| System.out.println("Deleting entry '"+ headJ +"' and moving group to entry '"+ headI +"'"); |
| } |
| |
| } |
| } |
| */ |
| for(int i = 0; i<standardizedTopics.size(); i++ ) |
| for(int j = i+1; j<standardizedTopics.size(); j++ ){ |
| String headI = extractHeadNounFromPhrase(standardizedTopics.get(i)); |
| String headJ = extractHeadNounFromPhrase(standardizedTopics.get(j)); |
| // if the same word do nothing |
| if (headI.equals(headJ)) |
| continue; |
| |
| //only if one is sub-word of another |
| if (headI.indexOf(headJ)>-1){ |
| |
| if (!properSubWordForm(headI, headJ)) |
| continue; |
| //entry 'I' will be updated |
| String newKey = standardizedTopics.get(i).replace(headI, headJ); |
| |
| List<ParseTreeChunk> stI = std_group.get(standardizedTopics.get(i)); |
| List<ParseTreeChunk> stInew = std_group.get(newKey); |
| //if (stInew!=null && !stInew.isEmpty()) |
| // stI.addAll(stInew); |
| if(stI==null) |
| continue; |
| std_group.put(newKey, stI); |
| std_group.remove(standardizedTopics.get(i)); |
| System.out.println("Deleted entry for key '"+ standardizedTopics.get(i) +"' and created '"+ newKey +"'"); |
| standardizedTopics.set(i, newKey); |
| |
| } else if (headJ.indexOf(headI)>-1){ |
| if (!properSubWordForm(headJ, headI)) |
| continue; |
| //entry 'J' will be updated |
| String newKey = standardizedTopics.get(j).replace(headJ, headI); |
| |
| List<ParseTreeChunk> stJ = std_group.get(standardizedTopics.get(j)); |
| List<ParseTreeChunk> stJnew = std_group.get(newKey); |
| //if (stJnew!=null && !stJnew.isEmpty()) |
| // stJ.addAll(stJnew); |
| if(stJ==null) |
| continue; |
| std_group.put(newKey, stJ); |
| std_group.remove(standardizedTopics.get(j)); |
| System.out.println("Deleted entry for key '"+ standardizedTopics.get(j) +"' and created '"+ newKey +"'"); |
| standardizedTopics.set(j, newKey); |
| } |
| } |
| |
| |
| |
| } |
| |
| private boolean properSubWordForm(String headI, String headJ) { |
| String suffix = headI.replace(headJ, ""); |
| if (suffix.equals("s") || suffix.equals("ing") //|| suffix.equals("er") |
| || suffix.equals("rooms") || |
| suffix.equals("") || suffix.equals("counter") || |
| suffix.equals("room") || suffix.equals("back")) |
| return true; |
| |
| //System.out.println("Wrong word '"+ headI + "'reduction into '" + headJ +"'"); |
| return false; |
| } |
| |
| //generates report |
| public void generateStdTopicReport(String reportName){ |
| List<String[]> report = new ArrayList<String[]>(); |
| report.add(new String[]{"category", "topic", "sub-topics", "phrase instances" }); |
| |
| for(String t: standardizedTopics){ |
| |
| String bufCover = ""; |
| int count = 0; |
| List<ParseTreeChunk> ptcList = std_group.get(t); |
| if (ptcList == null) |
| continue; |
| for(ParseTreeChunk ch: ptcList){ |
| List<String> candidate = TextProcessor.fastTokenize(ch.toWordOnlyString(), false); |
| List<String> tList = TextProcessor.fastTokenize(t, false); |
| List<String> tListChk = new ArrayList<String>(tList); |
| |
| tListChk.removeAll(candidate); |
| // fully covered by phrase instance |
| if (!tListChk.isEmpty() || ch.toWordOnlyString().equals(t)){ |
| continue; |
| } |
| |
| boolean bCovered = true; |
| |
| for(String ts: tList){ |
| boolean bCandWordsIsCovered = false; |
| for(String s: candidate){ |
| if ((s.indexOf(ts)>-1) )// && properSubWordForm(s, ts)) |
| bCandWordsIsCovered = true; |
| } |
| if (!bCandWordsIsCovered){ |
| bCovered = false; |
| break; |
| } |
| } |
| if (!bCovered) |
| continue; |
| bufCover+=ch.toWordOnlyString()+ " # "; |
| count++; |
| if (count > 40) |
| break; |
| |
| } |
| if (bufCover.endsWith(" # ")) |
| bufCover = bufCover.substring(0, bufCover.length()-3).trim(); |
| |
| String buf = ""; |
| count = 0; |
| // only up to 40 instances of phrases per 1-st level topic |
| for(ParseTreeChunk ch: ptcList){ |
| buf+=ch.toWordOnlyString()+ "|"; |
| count++; |
| if (count > 40) |
| break; |
| } |
| |
| //TODO uncomment |
| //t = spell.getSpellCheckResult(t); |
| report.add(new String[]{extractHeadNounFromPhrase(t), t, bufCover, buf //, std_group.get(t).toString() |
| }); |
| } |
| |
| |
| ProfileReaderWriter.writeReport(report, reportName); |
| } |
| // get a last word from a phrase (supposed to be a head noun) |
| private String extractHeadNounFromPhrase(String topic){ |
| String[] tops = topic.split(" "); |
| int len = tops.length; |
| if (len>1){ |
| return tops[len-1]; |
| } |
| else return topic; |
| } |
| |
| // get last elem of a list |
| private String getLastElement(List<String> arrayList ){ |
| if (arrayList != null && !arrayList.isEmpty()) { |
| return arrayList.get(arrayList.size()-1); |
| } |
| return null; |
| } |
| /* |
| * Using Bing API to check if an extracted phrase can be found on the web, therefore is a meaningful phrase |
| */ |
| public List<String> verifyTopic(){ |
| Set<String> phrases = freq.keySet(); |
| List<String> approvedPhrases = new ArrayList<String>(); |
| for(String p: phrases){ |
| List<HitBase> hits = runner.runSearch("\""+p+"\""); |
| for(HitBase h: hits){ |
| String lookup = h.getTitle() + " " + h.getAbstractText(); |
| if (lookup.indexOf(p)>-1){ |
| approvedPhrases.add(p); |
| break; |
| } |
| } |
| } |
| return approvedPhrases; |
| } |
| |
| public Set<String> getPhraseLookup(){ |
| return freq.keySet(); |
| } |
| |
| // using phrase frequency to filter phrases |
| public boolean isAcceptablePhrase(String phrase){ |
| Integer count = freq.get(phrase.toLowerCase().trim()); |
| if (count==null) |
| return false; |
| |
| if (count>0 && count < 10000) |
| return true; |
| return false; |
| } |
| |
| public static void main(String[] args){ |
| LinguisticPhraseManager man = new LinguisticPhraseManager( |
| "/Users/bgalitsky/Documents/workspace/move_com/phrasesOfInterest.csv"); |
| man.doLingGrouping(); |
| man.generateGroupingReport("topics_groups7_mergedHeads.csv"); |
| List<String> stdTopics = man.formStandardizedTopic(); |
| man.applyLastRoundOfAggregation(); |
| man.generateStdTopicReport("std_topics7_mergedHeads.csv"); |
| System.out.println(stdTopics); |
| |
| } |
| } |