/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.summarization.lexicalchaining;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Hashtable;
import java.util.List;
import java.util.logging.Logger;

import opennlp.summarization.DocProcessor;
import opennlp.summarization.Sentence;
import opennlp.summarization.Summarizer;

/*
 * Implements the algorithm outlined in "Using Lexical Chains for Text Summarization"
 * by R. Barzilay and M. Elhadad. The algorithm is based on extracting so-called
 * lexical chains - sets of sentences in the article that share very closely related
 * words. The longest, strongest chains represent the most important topics, and a
 * summary is formed by identifying the top-ranked chains and "pulling" sentences
 * out of them.
 */
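/*
 * Minimal usage sketch (illustrative only; assumes a DocProcessor implementation
 * such as DefaultDocProcessor and a locally available OpenNLP POS model file,
 * e.g. "en-pos-maxent.bin" - both are assumptions, not shipped with this class):
 *
 *   DocProcessor dp = ...; // e.g. a DefaultDocProcessor
 *   Summarizer summarizer = new LexicalChainingSummarizer(dp, "en-pos-maxent.bin");
 *   String summary = summarizer.summarize(articleText, dp, 100);
 */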
public class LexicalChainingSummarizer implements Summarizer {

    private POSTagger tagger;
    private final DocProcessor dp;
    private final WordRelationshipDetermination wordRel;
    private final Logger log;
    public LexicalChainingSummarizer(DocProcessor dp, String posModelFile) throws Exception {
        this.dp = dp;
        this.wordRel = new WordRelationshipDetermination();
        this.tagger = new OpenNLPPOSTagger(dp, posModelFile);
        this.log = Logger.getLogger("LexicalChainingSummarizer");
    }
    /**
     * Builds the lexical chains for an article: POS-tags each sentence, extracts
     * its nouns, and attaches every noun to the existing chains it relates to
     * (strongly, medium, or weakly), starting new chains when no relation is found.
     */
    public List<LexicalChain> buildLexicalChains(String article, List<Sentence> sent) {
        // Index of chains keyed by lexicon, plus the flat list of chains to return.
        Hashtable<String, List<LexicalChain>> chains = new Hashtable<>();
        List<LexicalChain> lc = new ArrayList<>();
        // Build lexical chains: for each sentence, POS-tag it and extract the nouns.
        for (Sentence currSent : sent) {
            log.info(currSent.getStringVal());
            String taggedSent = tagger.getTaggedString(currSent.getStringVal());
            List<String> nouns = tagger.getWordsOfType(taggedSent, POSTagger.NOUN);
            // For each noun, try to attach it to every existing chain.
            for (String noun : nouns) {
                int chainsAddCnt = 0;
                for (LexicalChain l : lc) {
                    try {
                        WordRelation rel = wordRel.getRelation(l, noun, (currSent.getSentId() - l.start) > 7);
                        if (rel.relation == WordRelation.STRONG_RELATION) {
                            // Exact match to a word in the chain (strong relation):
                            // add the sentence; if the chain has been dormant for more
                            // than 10 sentences, count a fresh occurrence and restart it.
                            addToChain(rel.dest, l, chains, currSent);
                            if (currSent.getSentId() - l.last > 10) {
                                l.occurences++;
                                l.start = currSent.getSentId();
                            }
                            chainsAddCnt++;
                        } else if (rel.relation == WordRelation.MED_RELATION) {
                            // Medium relation: add the sentence; if the chain started more
                            // than 7 sentences back, count it as a new occurrence of the chain.
                            addToChain(rel.dest, l, chains, currSent);
                            chainsAddCnt++;
                            if (currSent.getSentId() - l.start > 7) {
                                l.occurences++;
                                l.start = currSent.getSentId();
                            }
                        } else if (rel.relation == WordRelation.WEAK_RELATION) {
                            // 1-hop relation in WordNet (weak relation): add the sentence
                            // only if it is within 3 sentences of the chain start.
                            if (currSent.getSentId() - l.start <= 3) {
                                addToChain(rel.dest, l, chains, currSent);
                                chainsAddCnt++;
                            }
                        }
                    } catch (Exception ex) {
                        // Skip this chain on a lookup failure rather than swallowing it silently.
                        log.warning("Failed to relate '" + noun + "' to a chain: " + ex.getMessage());
                    }
                }
                // The noun did not fit any existing chain: start a new lexical
                // chain for each sense of the word.
                if (chainsAddCnt == 0) {
                    List<Word> senses = wordRel.getWordSenses(noun);
                    for (Word w : senses) {
                        LexicalChain newLc = new LexicalChain();
                        newLc.start = currSent.getSentId();
                        addToChain(w, newLc, chains, currSent);
                        lc.add(newLc);
                    }
                }
                if (lc.size() > 20)
                    purge(lc, currSent.getSentId(), sent.size());
            } // end noun loop
        } // end sentence loop
        // TODO: disambiguateAndCleanChains(lc, chains);
        // TODO: score chains (length of chain * homogeneity) and sort by strength.
        return lc;
}
    /**
     * Manages the number of lexical chains generated by expiring very small
     * chains, taking care to only remove small chains that were added long back.
     */
    private void purge(List<LexicalChain> lc, int sentId, int totSents) {
        // Do nothing until at least 20 chains have accumulated.
        if (lc.size() < 20) return;
        Collections.sort(lc);
        double min = lc.get(0).score();
        int cutOff = Math.max(3, (int) min);
        Hashtable<String, Boolean> words = new Hashtable<>();
        List<LexicalChain> toRem = new ArrayList<>();
        for (int i = lc.size() - 1; i >= 0; i--) {
            LexicalChain l = lc.get(i);
            if (l.score() < cutOff && (sentId - l.last) > totSents / 3) { // && containsAllWords(words, l.word)
                // Weak chain that has been dormant for a third of the document.
                toRem.add(l);
            } else if (words.containsKey(l.getWord().get(0).getLexicon()) && (sentId - l.start) > totSents / 10) {
                // A different sense of an already-covered word, added long back.
                toRem.add(l);
            } else {
                // Track the words of surviving chains so that weaker senses of
                // the same words can be expired above.
                for (Word w : l.word)
                    words.put(w.getLexicon(), Boolean.TRUE);
            }
        }
        lc.removeAll(toRem);
    }
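    // Returns true only if every word in the list is already tracked (keyed by
    // lexicon) in the given table. Currently referenced only from the
    // commented-out condition in purge() above.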
    private boolean containsAllWords(Hashtable<String, Boolean> words, List<Word> word) {
        for (Word w : word)
            if (!words.containsKey(w.getLexicon())) return false;
        return true;
    }
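    // Adds the word and sentence to the chain, updates the chain's last-seen
    // sentence id, and indexes the chain under the word's lexicon.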
    private void addToChain(Word noun, LexicalChain l,
                            Hashtable<String, List<LexicalChain>> chains, Sentence sent) {
        l.addWord(noun);
        l.addSentence(sent);
        l.last = sent.getSentId();
        // Hashtable.contains() checks values, not keys; key on the lexicon so
        // the get() below actually finds the list that was just created.
        if (!chains.containsKey(noun.getLexicon()))
            chains.put(noun.getLexicon(), new ArrayList<>());
        chains.get(noun.getLexicon()).add(l);
    }
POSTagger getTagger() {
return tagger;
}
void setTagger(POSTagger tagger) {
this.tagger = tagger;
}
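    /**
     * Forms a summary by ranking the lexical chains and taking one not-yet-used
     * sentence from each chain in rank order until the word budget is reached.
     */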
    @Override
    public String summarize(String article, DocProcessor dp, int maxWords) {
        List<Sentence> sent = dp.getSentencesFromStr(article);
        List<LexicalChain> lc = buildLexicalChains(article, sent);
        Collections.sort(lc);
        int summSize = 0;
        List<Sentence> summ = new ArrayList<>();
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < lc.size(); i++) {
            // Take the first sentence of this chain that is not already in the
            // summary. The inner loop must be bounded by this chain's sentence
            // count, not by the number of chains.
            for (int j = 0; j < lc.get(i).sentences.size(); j++) {
                Sentence candidate = lc.get(i).sentences.get(j);
                if (!summ.contains(candidate)) {
                    summ.add(candidate);
                    sb.append(candidate.getStringVal()).append(' ');
                    summSize += candidate.getWordCnt();
                    break;
                }
            }
            if (summSize >= maxWords) break;
        }
        return sb.toString().trim();
    }
}