blob: 0d2ba009a5aea908df67316f91e800c1264f998f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.apps.relevanceVocabs;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import opennlp.tools.parser.Parse;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.TextProcessor;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
import opennlp.tools.util.Span;
public class PhraseProcessor {
private ParserChunker2MatcherProcessor nlProc = ParserChunker2MatcherProcessor.getInstance() ;
public static boolean allChildNodesArePOSTags(Parse p)
{
Parse[] subParses = p.getChildren();
for (int pi = 0; pi < subParses.length; pi++)
if (!((Parse) subParses[pi]).isPosTag())
return false;
return true;
}
public ArrayList<String> getNounPhrases(Parse p)
{
ArrayList<String> nounphrases = new ArrayList<String>();
Parse[] subparses = p.getChildren();
for (int pi = 0; pi < subparses.length; pi++)
{
if (subparses[pi].getType().equals("NP") && allChildNodesArePOSTags(subparses[pi]))
{
Span _span = subparses[pi].getSpan();
nounphrases.add(p.getText().substring(_span.getStart(), _span.getEnd()));
}
else if (!((Parse) subparses[pi]).isPosTag())
nounphrases.addAll(getNounPhrases(subparses[pi]));
}
return nounphrases;
}
public ArrayList<String> getVerbPhrases(Parse p)
{
ArrayList<String> verbPhrases = new ArrayList<String>();
Parse[] subparses = p.getChildren();
for (int pi = 0; pi < subparses.length; pi++)
{
if (subparses[pi].getType().startsWith("VB") && allChildNodesArePOSTags(subparses[pi]))
{
Span _span = subparses[pi].getSpan();
verbPhrases.add(p.getText().substring(_span.getStart(), _span.getEnd()));
}
else if (!((Parse) subparses[pi]).isPosTag())
verbPhrases.addAll(getNounPhrases(subparses[pi]));
}
return verbPhrases;
}
// forms phrases from text which are candidate expressions for events lookup
public List<ParseTreeChunk> getVerbPhrases(String sentence) {
if (sentence==null)
return null;
if (sentence.split(" ").length ==1) { // this is a word, return empty
//queryArrayStr.add( sentence);
return null;
}
if (sentence.length()>100)
return null ; // too long of a sentence to parse
System.out.println("About to parse: "+sentence);
List<List<ParseTreeChunk>> groupedChunks = nlProc.formGroupedPhrasesFromChunksForPara(sentence);
if (groupedChunks.size()<1)
return null;
List<ParseTreeChunk> vPhrases = groupedChunks.get(1);
return vPhrases;
}
public List<List<ParseTreeChunk>> getPhrasesOfAllTypes(String sentence) {
if (sentence==null)
return null;
if (sentence.split(" ").length ==1) { // this is a word, return empty
//queryArrayStr.add( sentence);
return null;
}
if (sentence.length()>200)
return null ; // too long of a sentence to parse
System.out.println("About to parse: "+sentence);
List<List<ParseTreeChunk>> groupedChunks = nlProc.formGroupedPhrasesFromChunksForPara(sentence);
if (groupedChunks.size()<1)
return null;
return groupedChunks;
}
// forms phrases from text which are candidate expressions for events lookup
public List<String> extractNounPhraseProductNameCandidate(String sentence) {
List<String> queryArrayStr = new ArrayList<String>();
if (sentence.split(" ").length ==1) { // this is a word, return empty
//queryArrayStr.add( sentence);
return queryArrayStr;
}
String quoted1 = StringUtils.substringBetween(sentence, "\"", "\"");
String quoted2 = StringUtils.substringBetween(sentence, "\'", "\'");
List<List<ParseTreeChunk>> groupedChunks = nlProc.formGroupedPhrasesFromChunksForPara(sentence);
if (groupedChunks.size()<1)
return queryArrayStr;
List<ParseTreeChunk> nPhrases = groupedChunks.get(0);
for (ParseTreeChunk ch : nPhrases) {
String query = "";
int size = ch.getLemmas().size();
boolean phraseBeingFormed = false;
for (int i = 0; i < size; i++) {
if ((ch.getPOSs().get(i).startsWith("N") || ch.getPOSs().get(i)
.startsWith("J") || ch.getPOSs().get(i).startsWith("CD") ) )
// && StringUtils.isAlpha(ch.getLemmas().get(i)))
{
query += ch.getLemmas().get(i) + " ";
phraseBeingFormed = true;
} else
if ((ch.getPOSs().get(i).startsWith("PR") || ch.getPOSs().get(i).startsWith("IN") || ch.getPOSs().get(i).startsWith("TO") )
&& phraseBeingFormed )
break;
else if (ch.getPOSs().get(i).startsWith("DT") || ch.getPOSs().get(i).startsWith("CC"))
continue;
}
query = query.trim();
int len = query.split(" ").length;
if (len > 5 || len < 2) // too long or too short
continue;
/*
if (len < 4 && len>1) { // every word should start with capital
String[] qs = query.split(" ");
boolean bAccept = true;
for (String w : qs) {
if (w.toLowerCase().equals(w)) // idf only two words then
// has to be person name,
// title or geo
// location
bAccept = false;
}
if (!bAccept)
continue;
}
*/
// individual word, possibly a frequent word
// if len==1 do nothing
query = query.trim();
queryArrayStr.add(query);
}
/*
if (queryArrayStr.size() < 1) { // release constraints on NP down to 2
// keywords
for (ParseTreeChunk ch : nPhrases) {
String query = "";
int size = ch.getLemmas().size();
for (int i = 0; i < size; i++) {
if (ch.getPOSs().get(i).startsWith("N")
|| ch.getPOSs().get(i).startsWith("J")) {
query += ch.getLemmas().get(i) + " ";
}
}
query = query.trim();
int len = query.split(" ").length;
if (len < 2)
continue;
query = TextProcessor.fastTokenize(query.toLowerCase(), false)
.toString().replace('[', ' ').replace(']', ' ').trim();
if (query.length() > 6)
queryArrayStr.add(query);
}
}
//queryArrayStr = Utils
// .removeDuplicatesFromQueries(queryArrayStr);
if (quoted1 != null
&& ((quoted1.length() > 5 && !stopList.isCommonWord(quoted1)) || quoted1
.length() > 10))
queryArrayStr.add(quoted1);
if (quoted2 != null
&& ((quoted2.length() > 5 && !stopList.isCommonWord(quoted2)) || quoted2
.length() > 10))
queryArrayStr.add(quoted2);
*/ return queryArrayStr;
}
public static void main(String[] args){
String sent = "Appliances and Kitchen Gadgets - CNET Blogs";
//"The tablet phenomenon turns Silicon Valley upside down - SiliconValley.com";
List<String> res = new PhraseProcessor().extractNounPhraseProductNameCandidate(sent);
System.out.println(res);
}
}