/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.apps.relevanceVocabs; | |
import java.util.ArrayList; | |
import java.util.Comparator; | |
import java.util.List; | |
import org.apache.commons.lang.StringUtils; | |
import opennlp.tools.parser.Parse; | |
import opennlp.tools.textsimilarity.ParseTreeChunk; | |
import opennlp.tools.textsimilarity.TextProcessor; | |
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; | |
import opennlp.tools.util.Span; | |
public class PhraseProcessor { | |
private ParserChunker2MatcherProcessor nlProc = ParserChunker2MatcherProcessor.getInstance() ; | |
public static boolean allChildNodesArePOSTags(Parse p) | |
{ | |
Parse[] subParses = p.getChildren(); | |
for (int pi = 0; pi < subParses.length; pi++) | |
if (!((Parse) subParses[pi]).isPosTag()) | |
return false; | |
return true; | |
} | |
public ArrayList<String> getNounPhrases(Parse p) | |
{ | |
ArrayList<String> nounphrases = new ArrayList<String>(); | |
Parse[] subparses = p.getChildren(); | |
for (int pi = 0; pi < subparses.length; pi++) | |
{ | |
if (subparses[pi].getType().equals("NP") && allChildNodesArePOSTags(subparses[pi])) | |
{ | |
Span _span = subparses[pi].getSpan(); | |
nounphrases.add(p.getText().substring(_span.getStart(), _span.getEnd())); | |
} | |
else if (!((Parse) subparses[pi]).isPosTag()) | |
nounphrases.addAll(getNounPhrases(subparses[pi])); | |
} | |
return nounphrases; | |
} | |
public ArrayList<String> getVerbPhrases(Parse p) | |
{ | |
ArrayList<String> verbPhrases = new ArrayList<String>(); | |
Parse[] subparses = p.getChildren(); | |
for (int pi = 0; pi < subparses.length; pi++) | |
{ | |
if (subparses[pi].getType().startsWith("VB") && allChildNodesArePOSTags(subparses[pi])) | |
{ | |
Span _span = subparses[pi].getSpan(); | |
verbPhrases.add(p.getText().substring(_span.getStart(), _span.getEnd())); | |
} | |
else if (!((Parse) subparses[pi]).isPosTag()) | |
verbPhrases.addAll(getNounPhrases(subparses[pi])); | |
} | |
return verbPhrases; | |
} | |
// forms phrases from text which are candidate expressions for events lookup | |
public List<ParseTreeChunk> getVerbPhrases(String sentence) { | |
if (sentence==null) | |
return null; | |
if (sentence.split(" ").length ==1) { // this is a word, return empty | |
//queryArrayStr.add( sentence); | |
return null; | |
} | |
if (sentence.length()>100) | |
return null ; // too long of a sentence to parse | |
System.out.println("About to parse: "+sentence); | |
List<List<ParseTreeChunk>> groupedChunks = nlProc.formGroupedPhrasesFromChunksForPara(sentence); | |
if (groupedChunks.size()<1) | |
return null; | |
List<ParseTreeChunk> vPhrases = groupedChunks.get(1); | |
return vPhrases; | |
} | |
public List<List<ParseTreeChunk>> getPhrasesOfAllTypes(String sentence) { | |
if (sentence==null) | |
return null; | |
if (sentence.split(" ").length ==1) { // this is a word, return empty | |
//queryArrayStr.add( sentence); | |
return null; | |
} | |
if (sentence.length()>200) | |
return null ; // too long of a sentence to parse | |
System.out.println("About to parse: "+sentence); | |
List<List<ParseTreeChunk>> groupedChunks = nlProc.formGroupedPhrasesFromChunksForPara(sentence); | |
if (groupedChunks.size()<1) | |
return null; | |
return groupedChunks; | |
} | |
// forms phrases from text which are candidate expressions for events lookup | |
public List<String> extractNounPhraseProductNameCandidate(String sentence) { | |
List<String> queryArrayStr = new ArrayList<String>(); | |
if (sentence.split(" ").length ==1) { // this is a word, return empty | |
//queryArrayStr.add( sentence); | |
return queryArrayStr; | |
} | |
String quoted1 = StringUtils.substringBetween(sentence, "\"", "\""); | |
String quoted2 = StringUtils.substringBetween(sentence, "\'", "\'"); | |
List<List<ParseTreeChunk>> groupedChunks = nlProc.formGroupedPhrasesFromChunksForPara(sentence); | |
if (groupedChunks.size()<1) | |
return queryArrayStr; | |
List<ParseTreeChunk> nPhrases = groupedChunks.get(0); | |
for (ParseTreeChunk ch : nPhrases) { | |
String query = ""; | |
int size = ch.getLemmas().size(); | |
boolean phraseBeingFormed = false; | |
for (int i = 0; i < size; i++) { | |
if ((ch.getPOSs().get(i).startsWith("N") || ch.getPOSs().get(i) | |
.startsWith("J") || ch.getPOSs().get(i).startsWith("CD") ) ) | |
// && StringUtils.isAlpha(ch.getLemmas().get(i))) | |
{ | |
query += ch.getLemmas().get(i) + " "; | |
phraseBeingFormed = true; | |
} else | |
if ((ch.getPOSs().get(i).startsWith("PR") || ch.getPOSs().get(i).startsWith("IN") || ch.getPOSs().get(i).startsWith("TO") ) | |
&& phraseBeingFormed ) | |
break; | |
else if (ch.getPOSs().get(i).startsWith("DT") || ch.getPOSs().get(i).startsWith("CC")) | |
continue; | |
} | |
query = query.trim(); | |
int len = query.split(" ").length; | |
if (len > 5 || len < 2) // too long or too short | |
continue; | |
/* | |
if (len < 4 && len>1) { // every word should start with capital | |
String[] qs = query.split(" "); | |
boolean bAccept = true; | |
for (String w : qs) { | |
if (w.toLowerCase().equals(w)) // idf only two words then | |
// has to be person name, | |
// title or geo | |
// location | |
bAccept = false; | |
} | |
if (!bAccept) | |
continue; | |
} | |
*/ | |
// individual word, possibly a frequent word | |
// if len==1 do nothing | |
query = query.trim(); | |
queryArrayStr.add(query); | |
} | |
/* | |
if (queryArrayStr.size() < 1) { // release constraints on NP down to 2 | |
// keywords | |
for (ParseTreeChunk ch : nPhrases) { | |
String query = ""; | |
int size = ch.getLemmas().size(); | |
for (int i = 0; i < size; i++) { | |
if (ch.getPOSs().get(i).startsWith("N") | |
|| ch.getPOSs().get(i).startsWith("J")) { | |
query += ch.getLemmas().get(i) + " "; | |
} | |
} | |
query = query.trim(); | |
int len = query.split(" ").length; | |
if (len < 2) | |
continue; | |
query = TextProcessor.fastTokenize(query.toLowerCase(), false) | |
.toString().replace('[', ' ').replace(']', ' ').trim(); | |
if (query.length() > 6) | |
queryArrayStr.add(query); | |
} | |
} | |
//queryArrayStr = Utils | |
// .removeDuplicatesFromQueries(queryArrayStr); | |
if (quoted1 != null | |
&& ((quoted1.length() > 5 && !stopList.isCommonWord(quoted1)) || quoted1 | |
.length() > 10)) | |
queryArrayStr.add(quoted1); | |
if (quoted2 != null | |
&& ((quoted2.length() > 5 && !stopList.isCommonWord(quoted2)) || quoted2 | |
.length() > 10)) | |
queryArrayStr.add(quoted2); | |
*/ return queryArrayStr; | |
} | |
public static void main(String[] args){ | |
String sent = "Appliances and Kitchen Gadgets - CNET Blogs"; | |
//"The tablet phenomenon turns Silicon Valley upside down - SiliconValley.com"; | |
List<String> res = new PhraseProcessor().extractNounPhraseProductNameCandidate(sent); | |
System.out.println(res); | |
} | |
} |