// blob: e60c5747d498b748e22842615edb846f9c0538aa [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.summarization.textrank;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import opennlp.summarization.*;
import opennlp.summarization.preprocess.DefaultDocProcessor;
import opennlp.summarization.preprocess.IDFWordWeight;
import opennlp.summarization.preprocess.WordWeight;
/**
 * A wrapper around the text rank algorithm. This class
 * a) sets up the data for the {@link TextRank} class, and
 * b) takes the ranked sentences and turns them into a summary string.
 */
public class TextRankSummarizer implements Summarizer
{
    private static final Logger LOGGER = Logger.getLogger(TextRankSummarizer.class.getName());

    // An optional file to store idf of words. It was handed to the re-ranking step,
    // which is currently disabled (see the end of rankSentences), so it is not read today.
    private String idfFile = "resources/idf.csv";

    public TextRankSummarizer() throws Exception
    {
    }

    /**
     * Sets up the data and calls the TextRank algorithm.
     *
     * For every sentence this collects its raw text and its stemmed text, and builds an
     * inverted index from each stemmed token (split on single spaces) to the ids of the
     * sentences containing it. The resulting scores are also stored on the matching
     * {@link Sentence} objects, and the links reported by TextRank are attached to them.
     *
     * @param doc the document text, passed through to {@link TextRank}
     * @param sentences the sentences of the document; a sentence's id is used as its
     *        index into this list
     * @param dp the document processor handed to {@link TextRank}
     * @param maxWords not used by the ranking itself (only by the disabled re-rank step)
     * @return the scores in the order produced by TextRank, or {@code null} if ranking
     *         failed (the failure is logged)
     */
    public List<Score> rankSentences(String doc, List<Sentence> sentences,
            DocProcessor dp, int maxWords )
    {
        try {
            TextRank summ = new TextRank(dp);
            List<String> sentenceStrL = new ArrayList<String>();
            List<String> processedSent = new ArrayList<String>();
            Hashtable<String, List<Integer>> iidx = new Hashtable<String, List<Integer>>();

            for (Sentence s : sentences) {
                sentenceStrL.add(s.getStringVal());
                String stemmedSent = s.stem();
                processedSent.add(stemmedSent);
                for (String w : stemmedSent.split(" ")) {
                    // Single lookup; create the posting list on first sight of the token.
                    List<Integer> postings = iidx.get(w);
                    if (postings == null) {
                        postings = new ArrayList<Integer>();
                        iidx.put(w, postings);
                    }
                    postings.add(s.getSentId());
                }
            }

            List<Score> finalScores = summ.getRankedSentences(doc, sentenceStrL, iidx, processedSent);

            // Add links.. Each entry of the list is treated as the index of a linked
            // sentence (presumably what TextRank.getLinks() stores; verify against TextRank).
            // Only links that point back to an earlier sentence are attached.
            Hashtable<Integer, List<Integer>> links = summ.getLinks();
            for (int i = 0; i < sentences.size(); i++) {
                List<Integer> currLnks = links.get(i);
                if (currLnks == null) {
                    continue;
                }
                Sentence st = sentences.get(i);
                for (Integer linked : currLnks) {
                    if (linked != null && linked >= 0 && linked < i) {
                        st.addLink(sentences.get(linked));
                    }
                }
            }

            // Remember each sentence's score on the sentence itself.
            for (Score s : finalScores) {
                sentences.get(s.getSentId()).setPageRankScore(s);
            }

            // Re-ranking is disabled; it used to be:
            // reRank(sentences, finalScores, iidx, wordWt, maxWords)
            return finalScores;
        } catch (Exception e) {
            LOGGER.log(Level.SEVERE, "TextRank sentence ranking failed", e);
            return null;
        }
    }

    /**
     * Returns the summary as a string.
     *
     * @param article the text to summarize
     * @param dp the document processor used to split the article into sentences
     * @param maxWords size limit passed on to {@link #scores2String}
     * @return the summary; empty if the ranking step failed
     */
    @Override
    public String summarize(String article, DocProcessor dp, int maxWords) {
        List<Sentence> sentences = dp.getSentencesFromStr(article);
        List<Score> scores = this.rankSentences(article, sentences, dp, maxWords);
        return scores2String(sentences, scores, maxWords);
    }

    /**
     * Uses the page rank scores to determine the summary.
     *
     * Sentences are appended in the order of {@code scores} until the limit is reached
     * or the scores run out.
     *
     * NOTE(review): the limit compares the buffer's character count with
     * {@code maxWords}, and the string form of each score is appended right after its
     * sentence. Both are kept as they were because callers may rely on the output;
     * confirm whether they are intended.
     *
     * @param sentences the sentences, indexed by sentence id
     * @param scores the ranked scores; {@code null} is treated as "no scores"
     * @param maxWords the size limit described above
     * @return the concatenated summary text, possibly empty
     */
    public String scores2String(List<Sentence> sentences, List<Score> scores, int maxWords)
    {
        if (scores == null) {
            // rankSentences returns null when ranking failed.
            return "";
        }
        StringBuilder b = new StringBuilder();
        int i = 0;
        while (b.length() < maxWords && i < scores.size()) {
            Score sc = scores.get(i);
            String sent = sentences.get(sc.getSentId()).getStringVal();
            b.append(sent).append(sc);
            i++;
        }
        return b.toString();
    }
}