blob: 132416bb13cd4ccf96eeb321c2234b5482b87a00 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.summarization.meta;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Hashtable;
import java.util.List;
import opennlp.summarization.Score;
import opennlp.summarization.Sentence;
import opennlp.summarization.lexicalchaining.LexicalChain;
import opennlp.summarization.lexicalchaining.LexicalChainingSummarizer;
import opennlp.summarization.preprocess.DefaultDocProcessor;
import opennlp.summarization.textrank.TextRankSummarizer;
import java.util.logging.*;
import opennlp.summarization.DocProcessor;
/**
* A summarizer that combines results from the text rank algorithm and the lexical chaining algorithm.
* It runs both algorithms, using the lexical chains to identify the main topics and their relative
* importance, and the text rank scores to pick sentences from those lexical chains.
*/
public class MetaSummarizer {

    private static final Logger LOGGER = Logger.getLogger(MetaSummarizer.class.getName());

    /** Sentence-detector model path used when the caller does not supply one. */
    private static final String DEFAULT_SENT_MODEL = "resources/en-sent.bin";

    DocProcessor dp;
    TextRankSummarizer textRank;
    LexicalChainingSummarizer lcs;
    String sentFragModel = DEFAULT_SENT_MODEL;

    /**
     * Creates a summarizer that uses the default sentence model path
     * ({@code resources/en-sent.bin}).
     *
     * @param posModelFile path of the POS model handed to the lexical chaining summarizer
     * @throws Exception if one of the underlying components cannot be created
     */
    public MetaSummarizer(String posModelFile) throws Exception {
        this(posModelFile, DEFAULT_SENT_MODEL);
    }

    /**
     * Creates a summarizer with an explicit sentence model path.
     *
     * @param posModelFile  path of the POS model handed to the lexical chaining summarizer
     * @param sentModelFile path of the sentence model handed to the document processor
     * @throws Exception if one of the underlying components cannot be created
     */
    public MetaSummarizer(String posModelFile, String sentModelFile) throws Exception {
        LOGGER.info("Initializing Meta Summarizer");
        sentFragModel = sentModelFile;
        dp = new DefaultDocProcessor(sentFragModel);
        textRank = new TextRankSummarizer();
        lcs = new LexicalChainingSummarizer(dp, posModelFile);
    }

    /**
     * Sorts the given scores in place by ascending sentence id.
     *
     * @param s the scores to sort; the list itself is modified
     * @return the same list instance, now sorted
     */
    private List<Score> order(List<Score> s) {
        Collections.sort(s, new Comparator<Score>() {
            @Override
            public int compare(Score o1, Score o2) {
                // Integer.compare avoids the overflow that plain subtraction can produce.
                return Integer.compare(o1.getSentId(), o2.getSentId());
            }
        });
        return s;
    }

    /**
     * Finds, among the sentences of a lexical chain, the one with the highest score.
     *
     * @param l              the lexical chain whose sentences are examined
     * @param pageRankScores scores keyed by sentence id
     * @return the sentence id stored in the best score, or {@code -1} when no sentence
     *         of the chain has a score strictly greater than zero
     */
    public int getBestSent(LexicalChain l, Hashtable<Integer, Score> pageRankScores) {
        double bestScore = 0;
        int bestSentId = -1;
        for (Sentence s : l.getSentences()) {
            Score sc = pageRankScores.get(Integer.valueOf(s.getSentId()));
            if (sc != null && sc.getScore() > bestScore) {
                bestScore = sc.getScore();
                bestSentId = sc.getSentId();
            }
        }
        return bestSentId;
    }

    /**
     * Ranks sentences by merging the results of lexical chaining and text rank.
     * <p>
     * The lexical chains are sorted by their natural ordering and visited from the last
     * one to the first (presumably strongest first; confirm against {@code LexicalChain}).
     * Each chain contributes at most one sentence: its best scoring sentence that has not
     * been picked already. The word limit is checked after a sentence has been added, so
     * the result may exceed {@code maxWords} by the last sentence picked.
     * <p>
     * If text rank scoring fails, the failure is logged and an empty list is returned.
     *
     * @param article  the full text of the article
     * @param sent     the sentences of the article; a sentence is looked up by using its id
     *                 as the index into this list
     * @param maxWords word budget of the summary; a value of zero or less disables the limit
     * @return the scores of the selected sentences, sorted by sentence id
     */
    public List<Score> rankSentences(String article, List<Sentence> sent, int maxWords) {
        List<LexicalChain> lc = lcs.buildLexicalChains(article, sent);
        Collections.sort(lc);

        Hashtable<Integer, Score> sentScores = new Hashtable<Integer, Score>();
        try {
            List<Score> scores = textRank.rankSentences(article, sent, dp, article.length());
            for (Score s : scores) {
                sentScores.put(Integer.valueOf(s.getSentId()), s);
            }
        } catch (Exception ex) {
            // Best effort: without scores no sentence can be selected below.
            LOGGER.log(Level.WARNING, "Text rank scoring failed; no sentences will be selected", ex);
        }

        Hashtable<Sentence, Boolean> summSents = new Hashtable<Sentence, Boolean>();
        List<Score> finalSc = new ArrayList<Score>();
        int currWordCnt = 0;
        for (int i = lc.size() - 1; i >= 0; i--) {
            LexicalChain l = lc.get(i);
            while (l.getSentences().size() > 0) {
                int sentId = getBestSent(l, sentScores);
                if (sentId == -1) {
                    break;
                }
                Sentence s = sent.get(sentId);
                if (summSents.containsKey(s)) {
                    // Sentence already added: drop it from the chain and try the next best.
                    // If nothing was removed the chain cannot shrink, so stop instead of
                    // looping forever on the same sentence.
                    if (!l.getSentences().remove(s)) {
                        break;
                    }
                } else {
                    // sentId comes from a score held in sentScores, so this lookup cannot miss.
                    finalSc.add(sentScores.get(Integer.valueOf(sentId)));
                    summSents.put(s, Boolean.TRUE);
                    currWordCnt += s.getWordCnt();
                    break;
                }
            }
            if (maxWords > 0 && currWordCnt > maxWords) {
                break;
            }
        }
        order(finalSc);
        return finalSc;
    }

    /**
     * Summarizes an article by splitting it into sentences, ranking them with
     * {@link #rankSentences(String, List, int)} and concatenating the selected sentences
     * in document order.
     *
     * @param article  the text to summarize
     * @param maxWords word budget passed on to the ranking step
     * @return the selected sentences, each trimmed and followed by {@code ".. "}
     */
    public String summarize(String article, int maxWords) {
        List<Sentence> sent = dp.getSentencesFromStr(article);
        List<Score> finalSc = rankSentences(article, sent, maxWords);

        StringBuilder sb = new StringBuilder();
        for (Score score : finalSc) {
            sb.append(sent.get(score.getSentId()).toString().trim()).append(".. ");
        }
        return sb.toString();
    }

    /**
     * Demo entry point: summarizes a fixed set of sample files to about 50 words each and
     * prints every summary followed by the elapsed time in milliseconds.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String posModelFileName = "./resources/en-pos-maxent.bin";
        String[] articleFiles = {
            "test/tax.txt",
            "test/houston-rep-nopara.txt",
            "gunman.txt",
            "satellite.txt"
        };
        try {
            DefaultDocProcessor docProcessor = new DefaultDocProcessor(DEFAULT_SENT_MODEL);
            MetaSummarizer summarizer = new MetaSummarizer(posModelFileName);
            for (String file : articleFiles) {
                String article = docProcessor.docToString(file);
                long start = System.currentTimeMillis();
                System.out.println(summarizer.summarize(article, 50));
                System.out.println(System.currentTimeMillis() - start);
            }
        } catch (Exception ex) {
            LOGGER.log(Level.SEVERE, "Summarization demo failed", ex);
        }
    }
}