/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parse_thicket.opinion_processor;
import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.lang3.StringUtils;
import opennlp.tools.jsmlearning.ProfileReaderWriter;
import opennlp.tools.similarity.apps.BingQueryRunner;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.utils.ValueSortMap;
import opennlp.tools.stemmer.PStemmer;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.TextProcessor;
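/**
 * Groups linguistic phrases (noun-phrase chunks extracted from opinion text) by head noun,
 * derives standardized topic names from each group, and writes the results as CSV reports.
 * The synonym vocabulary and stop-word resources are loaded from the project's resource directories.
 */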
public class LinguisticPhraseManager {
private Map<String, Integer> freq = new ConcurrentHashMap<String, Integer>();
// this static object is initialized here to point StopList at the stop-word resources
private static StopList stop = StopList.getInstance(new File("src/test/resources").getAbsolutePath() + "/");
// this list will be overwritten by the external synonyms.csv
private static String[][] synonymPairs = new String[][]{};
private PStemmer stemmer = new PStemmer();
private List<ParseTreeChunk> lingPhrases = new ArrayList<ParseTreeChunk>();
private List<String> standardizedTopics = new ArrayList<String>();
// maps each linguistic phrase to the list of linguistic phrases that share its head noun
private Map<ParseTreeChunk, List<ParseTreeChunk>> entry_group = new ConcurrentHashMap<ParseTreeChunk, List<ParseTreeChunk>>();
// maps each standardized string phrase to the list of linguistic phrases that share its head noun
private Map<String, List<ParseTreeChunk>> std_group = new ConcurrentHashMap<String, List<ParseTreeChunk>>();
private BingQueryRunner runner = new BingQueryRunner();
private static final int MIN_NUMBER_OF_PHRASES_TO_CONSIDER = 3;
private static final int MIN_LENGTH_OF_WORD_TO_CONSIDER = 3;
private String resourceDir;
public LinguisticPhraseManager(){
try {
resourceDir = new File(".").getCanonicalPath()+"/src/main/resources/";
List<String[]> vocabs = ProfileReaderWriter.readProfiles(resourceDir+"synonyms.csv");
synonymPairs = new String[vocabs.size()][2];
int count = 0;
for(String[] line: vocabs){
// skip malformed vocabulary rows explicitly instead of relying on a catch-all exception handler
if (line!=null && line.length>=2){
synonymPairs[count] = line;
count++;
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
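// takes one line of the phrase-extraction log (a chain of parse-tree nodes) and builds a ParseTreeChunk;
// the phrase is expected to be an NP or a VP, otherwise parsing fails and an empty chunk results.
// Judging from the parsing below, an input line looks roughly like (an assumption, not a confirmed format):
// [<1>NP'the':DT, <2>NP'kitchen':NN, <3>NP'floor':NN]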
private ParseTreeChunk parseLingPhraseIntoParseTreeChunk(String phrStr){
ParseTreeChunk ch = new ParseTreeChunk();
List<String> POSs = new ArrayList<String>(), lemmas = new ArrayList<String>();
ch.setPOSs(POSs);
ch.setLemmas(lemmas);
String[] parts = phrStr.replace("]","").split(", <");
ch.setMainPOS(StringUtils.substringBetween(phrStr, ">", "'"));
try {
for(String part: parts){
String lemma = StringUtils.substringBetween(part, "P'", "':");
int colonPos = part.indexOf(':');
// guard against malformed parts: the original lowercased the lemma before the null check,
// which caused a NullPointerException
if (lemma==null || colonPos<0)
continue;
String pos = part.substring(colonPos+1);
POSs.add(pos.trim());
lemmas.add(lemma.toLowerCase().trim());
}
} catch (Exception e) {
// exceptions are expected if the extracted phrase is NEITHER an NP nor a VP;
// the (possibly empty) chunk will not create a new topic
e.printStackTrace();
}
return ch;
}
// constructor taking an array of extraction files, optimized for performance:
// only topics occurring at least MIN_NUMBER_OF_PHRASES_TO_CONSIDER times are turned into linguistic phrases
public LinguisticPhraseManager(String[] loadPaths){
List<String[]> rows = new ArrayList<String[]>();
for(String file: loadPaths){
rows.addAll(ProfileReaderWriter.readProfiles( file));
}
for(String[] l: rows){
if (l.length<3 || l[1]==null || l[2]==null)
continue;
String word = l[1].toLowerCase().trim();
if (word.indexOf("=>")>-1)
continue;
word = isAcceptableStringPhrase(word);
if (word==null)
continue;
if (!freq.containsKey(word)) {
freq.put(word, 1);
} else {
freq.put(word, freq.get(word) + 1);
// once the occurrence count reaches the threshold, create the topic
if (freq.get(word)==MIN_NUMBER_OF_PHRASES_TO_CONSIDER){
ParseTreeChunk ch = parseLingPhraseIntoParseTreeChunk(l[2]);
ch = isAcceptableLingPhrase(ch);
if (ch==null)
continue;
lingPhrases.add(ch);
}
}
}
// the frequency data is no longer needed
freq.clear();
}
// constructor taking a single topic extraction file; not optimized for performance
public LinguisticPhraseManager(String loadPath){
List<String[]> rows = ProfileReaderWriter.readProfiles( loadPath);
for(String[] l: rows){
if (l.length<3 || l[1]==null || l[2]==null)
continue;
String word = l[1].toLowerCase().trim();
if (word.indexOf("=>")>-1)
continue;
word = isAcceptableStringPhrase(word);
if (word==null)
continue;
if (!freq.containsKey(word)) {
ParseTreeChunk ch = parseLingPhraseIntoParseTreeChunk(l[2]);
ch = isAcceptableLingPhrase(ch);
if (ch==null)
continue;
freq.put(word, 1);
lingPhrases.add(ch);
} else {
freq.put(word, freq.get(word) + 1);
}
}
freq = ValueSortMap.sortMapByValue(freq, false);
}
// rejects phrases starting with 'to' and strips a leading article and trailing punctuation,
// in case this was not done at the phrase-forming stage; returns null for rejected phrases
private String isAcceptableStringPhrase(String word) {
if (word.startsWith("to "))
return null;
if (word.startsWith("a "))
return word.substring(2);
if (word.endsWith(" !") || word.endsWith(" ."))
return word.substring(0, word.length()-2).trim();
return word;
}
// we only accept noun phrases (NP); null-safe, since getMainPOS() may be null for malformed phrases
private ParseTreeChunk isAcceptableLingPhrase(ParseTreeChunk ch) {
if (!"NP".equals(ch.getMainPOS()))
return null;
return ch;
}
// groups are sets of phrases that share the same head noun.
// puts each phrase in a group and maps the group's longer phrase to the list of its members
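// for example (hypothetical data), 'nice kitchen floor' would become the key for a group
// containing 'kitchen floor', since both end with the head noun 'floor'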
public void doLingGrouping(){
for(int i=0; i< lingPhrases.size(); i++){
for(int j=i+1; j< lingPhrases.size(); j++){
ParseTreeChunk chI = lingPhrases.get(i);
ParseTreeChunk chJ = lingPhrases.get(j);
if (chI.getLemmas().get(chI.getLemmas().size()-1).equals(chJ.getLemmas().get(chJ.getLemmas().size()-1))
&& chI.getPOSs().get(chI.getLemmas().size()-1).startsWith("NN") ){
List<ParseTreeChunk> values = null;
if( chI.getLemmas().size()<chJ.getLemmas().size()){
// the longer phrase chJ is the group key; fetch its existing group so it is not overwritten
values = entry_group.get(chJ);
if (values == null)
values = new ArrayList<ParseTreeChunk>();
values.add(chI);
entry_group.put(chJ, values);
} else {
values = entry_group.get(chI);
if (values == null)
values = new ArrayList<ParseTreeChunk>();
values.add(chJ);
entry_group.put(chI, values);
}
}
}
}
}
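// for each head-noun group, intersects the lemma lists of every pair of member phrases;
// an intersection of two or more content words (NN/JJ/VB, with synonyms normalized) becomes a standardized topic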
public List<String> formStandardizedTopic(){
Set<ParseTreeChunk> keys = entry_group.keySet();
for(ParseTreeChunk k: keys){
List<ParseTreeChunk> group = entry_group.get(k);
for(int i=0; i< group.size(); i++)
for(int j=i+1; j< group.size(); j++){
ParseTreeChunk chI = group.get(i);
ParseTreeChunk chJ = group.get(j);
List<String> lemmas = new ArrayList<String>(chI.getLemmas());
lemmas.retainAll(chJ.getLemmas());
if (lemmas.size()<2)
continue;
String buf = ""; List<String> candTopicLst = new ArrayList<String>();
for(String w: lemmas){
if (w.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER)
continue;
if (!StringUtils.isAlpha(w))
continue;
// find POS of w
boolean bAccept = false;
for(int iw=0; iw<chI.getLemmas().size(); iw++){
if (w.equals(chI.getLemmas().get(iw))){
if (chI.getPOSs().get(iw).startsWith("NN") || chI.getPOSs().get(iw).startsWith("JJ")
|| chI.getPOSs().get(iw).startsWith("VB"))
bAccept=true;
}
}
if (bAccept){
//buf+=w+" ";
String ws = substituteSynonym(w);
candTopicLst.add(ws);
}
}
// remove duplicates like 'new new house'
//candTopicLst = new ArrayList<String>(new HashSet<String>(candTopicLst));
for(String w: candTopicLst){
buf+=w+" ";
}
buf = buf.trim();
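// reject single-word topics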
if (buf.indexOf(' ')<0)
continue;
if (!standardizedTopics.contains(buf)){
standardizedTopics.add(buf);
std_group.put(buf, group);
}
}
}
cleanUpStandardizedTopics();
return standardizedTopics;
}
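// merges standardized topics that contain the same set of words after stemming,
// keeping the shorter surface form and moving the group members to it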
public void cleanUpStandardizedTopics(){
List<String> toDelete = new ArrayList<String>();
for(int i=0; i< standardizedTopics.size(); i++)
for(int j=i+1; j< standardizedTopics.size(); j++){
List<String> t1 = TextProcessor.fastTokenize(standardizedTopics.get(i), false);
List<String> t2 = TextProcessor.fastTokenize(standardizedTopics.get(j), false);
for(int k=0; k< t1.size(); k++){
t1.set(k, stemmer.stem(t1.get(k)));
}
for(int k=0; k< t2.size(); k++){
t2.set(k, stemmer.stem(t2.get(k)));
}
// check if lists are equal
if (t1.size()!=t2.size())
continue;
// if, once all keywords are stemmed, one phrase annihilates the other, the two topics are duplicates
t1.removeAll(t2);
if (t1.isEmpty()){
if (standardizedTopics.get(i).length()> standardizedTopics.get(j).length()){
toDelete.add(standardizedTopics.get(i));
// TODO update std_group entry
System.out.println("Removing '" + standardizedTopics.get(i) + "' because of '" + standardizedTopics.get(j) );
List<ParseTreeChunk> stJ = std_group.get(standardizedTopics.get(j));
stJ.addAll(std_group.get(standardizedTopics.get(i)));
stJ = new ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stJ));
std_group.put(standardizedTopics.get(j), stJ);
}
else {
toDelete.add(standardizedTopics.get(j));
System.out.println("Removing '" + standardizedTopics.get(j) + "' because of '" + standardizedTopics.get(i) );
List<ParseTreeChunk> stI = std_group.get(standardizedTopics.get(i));
stI.addAll(std_group.get(standardizedTopics.get(j)));
stI = new ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stI));
std_group.put(standardizedTopics.get(i), stI);
}
}
}
for(String d: toDelete){
//System.out.println("Removed '" + d + "'");
standardizedTopics.remove(d);
}
}
// substitutes a word by its synonym according to the internal vocabulary (synonyms.csv)
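// e.g. with a hypothetical vocabulary row {"flooring", "floor"}, substituteSynonym("flooring") returns "floor"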
private String substituteSynonym(String w) {
for(String[] pair: synonymPairs){
// skip malformed rows explicitly instead of relying on a catch-all exception handler
if (pair!=null && pair.length>=2 && w.equals(pair[0]))
return pair[1];
}
return w;
}
public void generateGroupingReport(String reportName){
List<String[]> report = new ArrayList<String[]>();
Set<ParseTreeChunk> chs = entry_group.keySet();
report.add(new String[]{"string phrase" , "class", "linguistic phrase", "list of ling phrases class representatives"});
for(ParseTreeChunk ch: chs){
String head = ch.getLemmas().get(ch.getLemmas().size()-1);
List<ParseTreeChunk> values = entry_group.get(ch);
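// only groups with at least 6 members get a head-noun class label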
if (values.size()<6)
head = "";
report.add(new String[]{ch.toWordOnlyString(), head, ch.toString(), values.toString()});
}
ProfileReaderWriter.writeReport(report, reportName);
}
// final merge of head-noun variants such as floor-floors-flooring, with phrase update
public void applyLastRoundOfAggregation(){
//merge <floor - floors - flooring>
/*
List<ParseTreeChunk> entries = new ArrayList<ParseTreeChunk>(entry_group.keySet());
for(int i=0; i< entries.size(); i++){
for(int j=i+1; j< entries.size(); j++){
ParseTreeChunk chI = entries.get(i);
ParseTreeChunk chJ = entries.get(j);
String headI = getLastElement(chI.getLemmas());
String headJ = getLastElement(chJ.getLemmas());
if (headI==null || headI.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER ||
headJ==null || headJ.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER )
continue;
if (headI.indexOf(headJ)>-1){
//leave headJ
List<ParseTreeChunk> valuesToAddTo = entry_group.get(chJ);
List<ParseTreeChunk> valuesBeingAdded = entry_group.get(chI);
if (valuesToAddTo==null || valuesBeingAdded == null)
continue;
valuesToAddTo.addAll(valuesBeingAdded);
entry_group.put(chJ, valuesToAddTo);
entry_group.remove(chI);
System.out.println("Deleting entry '"+ headI +"' and moving group to entry '"+ headJ +"'");
} else if (headJ.indexOf(headI)>-1){
//leave headJ
List<ParseTreeChunk> valuesToAddTo = entry_group.get(chI);
List<ParseTreeChunk> valuesBeingAdded = entry_group.get(chJ);
if (valuesToAddTo==null || valuesBeingAdded == null)
continue;
valuesToAddTo.addAll(valuesBeingAdded);
entry_group.put(chI, valuesToAddTo);
entry_group.remove(chJ);
System.out.println("Deleting entry '"+ headJ +"' and moving group to entry '"+ headI +"'");
}
}
}
*/
for(int i = 0; i<standardizedTopics.size(); i++ )
for(int j = i+1; j<standardizedTopics.size(); j++ ){
String headI = extractHeadNounFromPhrase(standardizedTopics.get(i));
String headJ = extractHeadNounFromPhrase(standardizedTopics.get(j));
// if the same word do nothing
if (headI.equals(headJ))
continue;
// merge only if one head is a sub-word of the other
if (headI.indexOf(headJ)>-1){
if (!properSubWordForm(headI, headJ))
continue;
//entry 'I' will be updated
String newKey = standardizedTopics.get(i).replace(headI, headJ);
List<ParseTreeChunk> stI = std_group.get(standardizedTopics.get(i));
List<ParseTreeChunk> stInew = std_group.get(newKey);
//if (stInew!=null && !stInew.isEmpty())
// stI.addAll(stInew);
if(stI==null)
continue;
std_group.put(newKey, stI);
std_group.remove(standardizedTopics.get(i));
System.out.println("Deleted entry for key '"+ standardizedTopics.get(i) +"' and created '"+ newKey +"'");
standardizedTopics.set(i, newKey);
} else if (headJ.indexOf(headI)>-1){
if (!properSubWordForm(headJ, headI))
continue;
//entry 'J' will be updated
String newKey = standardizedTopics.get(j).replace(headJ, headI);
List<ParseTreeChunk> stJ = std_group.get(standardizedTopics.get(j));
List<ParseTreeChunk> stJnew = std_group.get(newKey);
//if (stJnew!=null && !stJnew.isEmpty())
// stJ.addAll(stJnew);
if(stJ==null)
continue;
std_group.put(newKey, stJ);
std_group.remove(standardizedTopics.get(j));
System.out.println("Deleted entry for key '"+ standardizedTopics.get(j) +"' and created '"+ newKey +"'");
standardizedTopics.set(j, newKey);
}
}
}
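// accepts a reduction of headI to headJ only when the leftover suffix is a plural/gerund ending
// or one of a few domain-specific compound parts (e.g. 'bathroom' reduces to 'bath')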
private boolean properSubWordForm(String headI, String headJ) {
String suffix = headI.replace(headJ, "");
if (suffix.equals("s") || suffix.equals("ing") //|| suffix.equals("er")
|| suffix.equals("rooms") ||
suffix.equals("") || suffix.equals("counter") ||
suffix.equals("room") || suffix.equals("back"))
return true;
//System.out.println("Wrong word '"+ headI + "'reduction into '" + headJ +"'");
return false;
}
// generates a CSV report with columns: category (head noun), topic, covering sub-topics, and phrase instances
public void generateStdTopicReport(String reportName){
List<String[]> report = new ArrayList<String[]>();
report.add(new String[]{"category", "topic", "sub-topics", "phrase instances" });
for(String t: standardizedTopics){
String bufCover = "";
int count = 0;
List<ParseTreeChunk> ptcList = std_group.get(t);
if (ptcList == null)
continue;
for(ParseTreeChunk ch: ptcList){
List<String> candidate = TextProcessor.fastTokenize(ch.toWordOnlyString(), false);
List<String> tList = TextProcessor.fastTokenize(t, false);
List<String> tListChk = new ArrayList<String>(tList);
tListChk.removeAll(candidate);
// skip instances that do not fully cover the topic, or that are identical to it
if (!tListChk.isEmpty() || ch.toWordOnlyString().equals(t)){
continue;
}
boolean bCovered = true;
for(String ts: tList){
boolean bCandWordsIsCovered = false;
for(String s: candidate){
if ((s.indexOf(ts)>-1) )// && properSubWordForm(s, ts))
bCandWordsIsCovered = true;
}
if (!bCandWordsIsCovered){
bCovered = false;
break;
}
}
if (!bCovered)
continue;
bufCover+=ch.toWordOnlyString()+ " # ";
count++;
if (count >= 40)
break;
}
if (bufCover.endsWith(" # "))
bufCover = bufCover.substring(0, bufCover.length()-3).trim();
String buf = "";
count = 0;
// list at most 40 phrase instances per first-level topic
for(ParseTreeChunk ch: ptcList){
buf+=ch.toWordOnlyString()+ "|";
count++;
if (count >= 40)
break;
}
//TODO uncomment
//t = spell.getSpellCheckResult(t);
report.add(new String[]{extractHeadNounFromPhrase(t), t, bufCover, buf //, std_group.get(t).toString()
});
}
ProfileReaderWriter.writeReport(report, reportName);
}
// gets the last word of a phrase (assumed to be the head noun)
private String extractHeadNounFromPhrase(String topic){
String[] tops = topic.split(" ");
int len = tops.length;
if (len>1){
return tops[len-1];
}
else return topic;
}
// gets the last element of a list
private String getLastElement(List<String> arrayList ){
if (arrayList != null && !arrayList.isEmpty()) {
return arrayList.get(arrayList.size()-1);
}
return null;
}
/*
* Uses the Bing API to check whether an extracted phrase can be found on the web and is therefore a meaningful phrase
*/
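// note: the multi-file constructor clears freq after loading, so this check is only meaningful
// for an instance built with the single-file constructor, which keeps the frequency map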
public List<String> verifyTopic(){
Set<String> phrases = freq.keySet();
List<String> approvedPhrases = new ArrayList<String>();
for(String p: phrases){
List<HitBase> hits = runner.runSearch("\""+p+"\"");
for(HitBase h: hits){
// phrases in freq are lowercased, so normalize the search-hit text before matching
String lookup = (h.getTitle() + " " + h.getAbstractText()).toLowerCase();
if (lookup.indexOf(p)>-1){
approvedPhrases.add(p);
break;
}
}
}
return approvedPhrases;
}
public Set<String> getPhraseLookup(){
return freq.keySet();
}
// using phrase frequency to filter phrases
public boolean isAcceptablePhrase(String phrase){
Integer count = freq.get(phrase.toLowerCase().trim());
if (count==null)
return false;
// phrases occurring extremely often are likely noise
return count > 0 && count < 10000;
}
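// example end-to-end run; the input CSV path below is environment-specific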
public static void main(String[] args){
LinguisticPhraseManager man = new LinguisticPhraseManager(
"/Users/bgalitsky/Documents/workspace/move_com/phrasesOfInterest.csv");
man.doLingGrouping();
man.generateGroupingReport("topics_groups7_mergedHeads.csv");
List<String> stdTopics = man.formStandardizedTopic();
man.applyLastRoundOfAggregation();
man.generateStdTopicReport("std_topics7_mergedHeads.csv");
System.out.println(stdTopics);
}
}