| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.tools.apps.relevanceVocabs; |
| |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import org.apache.commons.lang.StringUtils; |
| |
| import opennlp.tools.parser.Parse; |
| import opennlp.tools.textsimilarity.ParseTreeChunk; |
| import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; |
| import opennlp.tools.util.Span; |
| |
| public class PhraseProcessor { |
| |
| private final ParserChunker2MatcherProcessor nlProc = ParserChunker2MatcherProcessor.getInstance() ; |
| |
| public static boolean allChildNodesArePOSTags(Parse p) { |
| Parse[] subParses = p.getChildren(); |
| for (Parse subPars : subParses) |
| if (!subPars.isPosTag()) |
| return false; |
| return true; |
| } |
| |
| public ArrayList<String> getNounPhrases(Parse p) |
| { |
| ArrayList<String> nounphrases = new ArrayList<>(); |
| |
| Parse[] subparses = p.getChildren(); |
| for (Parse subpars : subparses) { |
| |
| if (subpars.getType().equals("NP") && allChildNodesArePOSTags(subpars)) { |
| Span _span = subpars.getSpan(); |
| nounphrases.add(p.getText().substring(_span.getStart(), _span.getEnd())); |
| } else if (!subpars.isPosTag()) |
| nounphrases.addAll(getNounPhrases(subpars)); |
| } |
| |
| return nounphrases; |
| } |
| |
| public ArrayList<String> getVerbPhrases(Parse p) |
| { |
| ArrayList<String> verbPhrases = new ArrayList<>(); |
| |
| Parse[] subparses = p.getChildren(); |
| for (Parse subpars : subparses) { |
| |
| if (subpars.getType().startsWith("VB") && allChildNodesArePOSTags(subpars)) { |
| Span _span = subpars.getSpan(); |
| verbPhrases.add(p.getText().substring(_span.getStart(), _span.getEnd())); |
| } else if (!subpars.isPosTag()) |
| verbPhrases.addAll(getNounPhrases(subpars)); |
| } |
| |
| return verbPhrases; |
| } |
| |
| // forms phrases from text which are candidate expressions for events lookup |
| public List<ParseTreeChunk> getVerbPhrases(String sentence) { |
| if (sentence==null) |
| return null; |
| if (sentence.split(" ").length ==1) { // this is a word, return empty |
| //queryArrayStr.add( sentence); |
| return null; |
| } |
| if (sentence.length()>100) |
| return null ; // too long of a sentence to parse |
| |
| System.out.println("About to parse: "+sentence); |
| List<List<ParseTreeChunk>> groupedChunks = nlProc.formGroupedPhrasesFromChunksForPara(sentence); |
| if (groupedChunks.size()<1) |
| return null; |
| |
| return groupedChunks.get(1); |
| } |
| |
| public List<List<ParseTreeChunk>> getPhrasesOfAllTypes(String sentence) { |
| if (sentence==null) |
| return null; |
| if (sentence.split(" ").length ==1) { // this is a word, return empty |
| //queryArrayStr.add( sentence); |
| return null; |
| } |
| if (sentence.length()>200) |
| return null ; // too long of a sentence to parse |
| |
| System.out.println("About to parse: "+sentence); |
| List<List<ParseTreeChunk>> groupedChunks = nlProc.formGroupedPhrasesFromChunksForPara(sentence); |
| if (groupedChunks.size()<1) |
| return null; |
| |
| return groupedChunks; |
| } |
| |
| // forms phrases from text which are candidate expressions for events lookup |
| public List<String> extractNounPhraseProductNameCandidate(String sentence) { |
| |
| List<String> queryArrayStr = new ArrayList<>(); |
| |
| if (sentence.split(" ").length ==1) { // this is a word, return empty |
| //queryArrayStr.add( sentence); |
| return queryArrayStr; |
| } |
| String quoted1 = StringUtils.substringBetween(sentence, "\"", "\""); |
| String quoted2 = StringUtils.substringBetween(sentence, "\'", "\'"); |
| List<List<ParseTreeChunk>> groupedChunks = nlProc.formGroupedPhrasesFromChunksForPara(sentence); |
| if (groupedChunks.size()<1) |
| return queryArrayStr; |
| |
| List<ParseTreeChunk> nPhrases = groupedChunks.get(0); |
| |
| for (ParseTreeChunk ch : nPhrases) { |
| StringBuilder query = new StringBuilder(); |
| int size = ch.getLemmas().size(); |
| boolean phraseBeingFormed = false; |
| for (int i = 0; i < size; i++) { |
| if ((ch.getPOSs().get(i).startsWith("N") || ch.getPOSs().get(i) |
| .startsWith("J") || ch.getPOSs().get(i).startsWith("CD") ) ) |
| // && StringUtils.isAlpha(ch.getLemmas().get(i))) |
| { |
| query.append(ch.getLemmas().get(i)).append(" "); |
| phraseBeingFormed = true; |
| } else |
| if ((ch.getPOSs().get(i).startsWith("PR") || ch.getPOSs().get(i).startsWith("IN") || ch.getPOSs().get(i).startsWith("TO") ) |
| && phraseBeingFormed ) |
| break; |
| else if (ch.getPOSs().get(i).startsWith("DT") || ch.getPOSs().get(i).startsWith("CC")) |
| continue; |
| } |
| query = new StringBuilder(query.toString().trim()); |
| int len = query.toString().split(" ").length; |
| if (len > 5 || len < 2) // too long or too short |
| continue; |
| |
| /* |
| if (len < 4 && len>1) { // every word should start with capital |
| String[] qs = query.split(" "); |
| boolean bAccept = true; |
| for (String w : qs) { |
| if (w.toLowerCase().equals(w)) // idf only two words then |
| // has to be person name, |
| // title or geolocation |
| bAccept = false; |
| } |
| if (!bAccept) |
| continue; |
| } |
| */ |
| // individual word, possibly a frequent word |
| // if len==1 do nothing |
| |
| query = new StringBuilder(query.toString().trim()); |
| queryArrayStr.add(query.toString()); |
| |
| } |
| /* |
| if (queryArrayStr.size() < 1) { // release constraints on NP down to 2 |
| // keywords |
| for (ParseTreeChunk ch : nPhrases) { |
| String query = ""; |
| int size = ch.getLemmas().size(); |
| |
| for (int i = 0; i < size; i++) { |
| if (ch.getPOSs().get(i).startsWith("N") |
| || ch.getPOSs().get(i).startsWith("J")) { |
| query += ch.getLemmas().get(i) + " "; |
| } |
| } |
| query = query.trim(); |
| int len = query.split(" ").length; |
| if (len < 2) |
| continue; |
| |
| query = TextProcessor.fastTokenize(query.toLowerCase(), false) |
| .toString().replace('[', ' ').replace(']', ' ').trim(); |
| if (query.length() > 6) |
| queryArrayStr.add(query); |
| } |
| } |
| //queryArrayStr = Utils |
| // .removeDuplicatesFromQueries(queryArrayStr); |
| if (quoted1 != null |
| && ((quoted1.length() > 5 && !stopList.isCommonWord(quoted1)) || quoted1 |
| .length() > 10)) |
| queryArrayStr.add(quoted1); |
| if (quoted2 != null |
| && ((quoted2.length() > 5 && !stopList.isCommonWord(quoted2)) || quoted2 |
| .length() > 10)) |
| queryArrayStr.add(quoted2); |
| */ |
| return queryArrayStr; |
| } |
| |
| public static void main(String[] args){ |
| String sent = "Appliances and Kitchen Gadgets - CNET Blogs"; |
| //"The tablet phenomenon turns Silicon Valley upside down - SiliconValley.com"; |
| List<String> res = new PhraseProcessor().extractNounPhraseProductNameCandidate(sent); |
| System.out.println(res); |
| } |
| } |