| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.summarization.meta; |
| |
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import opennlp.summarization.DocProcessor;
import opennlp.summarization.Score;
import opennlp.summarization.Sentence;
import opennlp.summarization.lexicalchaining.LexicalChain;
import opennlp.summarization.lexicalchaining.LexicalChainingSummarizer;
import opennlp.summarization.preprocess.DefaultDocProcessor;
import opennlp.summarization.textrank.TextRankSummarizer;
| |
| /** |
| * A summarizer that combines results from the text rank algorithm and the lexical chaining algorithm. |
| * It runs both algorithm and uses the lexical chains to identify the main topics and relative importance |
| * and the text rank to pick sentences from lexical chains. |
| */ |
| public class MetaSummarizer { |
| private final DocProcessor dp; |
| private final TextRankSummarizer textRank; |
| private final LexicalChainingSummarizer lcs; |
| private static final String SENT_FRAG_MODEL = "/en-sent.bin"; |
| |
| public MetaSummarizer(String posModelFile) throws Exception { |
| Logger.getAnonymousLogger().info("Initializing Meta Summarizer"); |
| dp = new DefaultDocProcessor(MetaSummarizer.class.getResourceAsStream(SENT_FRAG_MODEL)); |
| textRank = new TextRankSummarizer(); |
| lcs = new LexicalChainingSummarizer(dp, new FileInputStream(posModelFile)); |
| } |
| |
| // A utility method to sort the ranked sentences by sentence order. |
| private List<Score> order(List<Score> s) { |
| s.sort(Comparator.comparingInt(Score::getSentId)); |
| return s; |
| } |
| |
| // Rank sentences by merging the scores from lexical chaining and text rank. |
| // maxWords -1 indicates rank all sentences. |
| public int getBestSent(LexicalChain l, Hashtable<Integer, Score> pageRankScores) { |
| double bestScore = 0; int bestStr = -1; |
| for(Sentence s : l.getSentences()) { |
| Score sc = pageRankScores.get(s.getSentId()); |
| if(sc!=null && sc.getScore() > bestScore) |
| { |
| bestScore = sc.getScore(); |
| bestStr = sc.getSentId(); |
| } |
| } |
| return bestStr; |
| } |
| |
| public List<Score> rankSentences(String article, List<Sentence> sent, int maxWords) { |
| List<LexicalChain> lc = lcs.buildLexicalChains(article, sent); |
| Collections.sort(lc); |
| Hashtable<Integer, Score> sentScores = new Hashtable<>(); |
| try{ |
| List<Score> scores = textRank.rankSentences(article, sent, dp, article.length()); |
| for(Score s: scores) sentScores.put(s.getSentId(), s); |
| }catch(Exception ex){ |
| ex.printStackTrace(); |
| } |
| |
| Hashtable<Sentence, Boolean> summSents = new Hashtable<>(); |
| List<Score> finalSc = new ArrayList<>(); |
| int currWordCnt = 0; |
| for(int i=lc.size()-1;i>=0;i--) |
| { |
| LexicalChain l = lc.get(i); |
| while(l.getSentences().size()>0) |
| { |
| int sentId = getBestSent(l, sentScores); |
| if(sentId == -1) break; |
| |
| Sentence s = sent.get(sentId); |
| |
| //Sentence already added, try again.. |
| if(summSents.containsKey(s)) |
| l.getSentences().remove(s); |
| else{ |
| finalSc.add(sentScores.get(s.getSentId())); |
| summSents.put(s, true); |
| currWordCnt += s.getWordCnt(); |
| break; |
| } |
| } |
| if(maxWords>0 && currWordCnt>maxWords) break; |
| } |
| |
| order(finalSc); |
| return finalSc; |
| } |
| |
| //Default Summarization using only lexical chains.. |
| public String summarize(String article, int maxWords) { |
| //Build lexical Chains.. |
| List<Sentence> sent = dp.getSentencesFromStr(article); |
| List<Score> finalSc = rankSentences(article, sent, maxWords); |
| |
| StringBuilder sb = new StringBuilder(); |
| for (Score score : finalSc) { |
| sb.append(sent.get(score.getSentId()).toString().trim()).append(".. "); |
| } |
| // Pick sentences |
| return sb.toString(); |
| } |
| |
| public static void main(String[] args) { |
| try{ |
| String posModelFileName = "en-pos-maxent.bin"; |
| String sentFragModel = "en-sent.bin"; |
| DefaultDocProcessor dp =new DefaultDocProcessor(MetaSummarizer.class.getResourceAsStream(sentFragModel)); |
| MetaSummarizer lcs = new MetaSummarizer(posModelFileName); |
| String article = dp.docToString("test/tax.txt"); |
| long strt = System.currentTimeMillis(); |
| System.out.println(lcs.summarize(article, 50)); |
| System.out.println(System.currentTimeMillis() - strt); |
| |
| article = dp.docToString("test/houston-rep-nopara.txt"); |
| strt = System.currentTimeMillis(); |
| System.out.println(lcs.summarize(article, 50)); |
| System.out.println(System.currentTimeMillis() - strt); |
| |
| article = dp.docToString("gunman.txt"); |
| strt = System.currentTimeMillis(); |
| System.out.println(lcs.summarize(article, 50)); |
| System.out.println(System.currentTimeMillis() - strt); |
| |
| article = dp.docToString("satellite.txt"); |
| strt = System.currentTimeMillis(); |
| System.out.println(lcs.summarize(article, 50)); |
| System.out.println(System.currentTimeMillis() - strt); |
| } catch(Exception ex) { |
| ex.printStackTrace(); |
| } |
| } |
| } |