| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.summarization.lexicalchaining; |
| |
| import java.util.*; |
| import java.util.logging.Logger; |
| |
| import opennlp.summarization.DocProcessor; |
| import opennlp.summarization.Sentence; |
| import opennlp.summarization.Summarizer; |
| import opennlp.summarization.preprocess.DefaultDocProcessor; |
| |
| /* |
| * Implements the algorithm outlined in - "Summarization Using Lexical Chains" by R. Berzilay et al. |
| * The algorithm is based on so extracting so called lexical chains - a set of sentences in the article |
| * that share a word that are very closely related. Thus the longest chain represents the most important |
| * topic and so forth. A summary can then be formed by identifying the most important lexical chains |
| * and "pulling" out sentences from them. |
| */ |
public class LexicalChainingSummarizer implements Summarizer {
| |
| private POSTagger tagger; |
| private DocProcessor dp; |
| private WordRelationshipDetermination wordRel; |
| private Logger log; |
  public LexicalChainingSummarizer(DocProcessor dp, String posModelFile) throws Exception
  {
    this.dp = dp;
    wordRel = new WordRelationshipDetermination();
    tagger = new OpenNLPPOSTagger(dp, posModelFile);
    log = Logger.getLogger("LexicalChainingSummarizer");
  }
| |
  // Builds lexical chains for the article: POS-tags each sentence, extracts its nouns, and
  // attaches each noun to related existing chains, or starts new chains (one per word sense).
| public List<LexicalChain> buildLexicalChains(String article, List<Sentence> sent) |
| { |
    // Chains are indexed by the lexicon of the words they contain;
    // lc holds all chains in creation order.
    Map<String, List<LexicalChain>> chains = new HashMap<>();
    List<LexicalChain> lc = new ArrayList<>();
| // Build lexical chains |
| // For each sentence |
| for(Sentence currSent : sent) |
| { |
| log.info(currSent.getStringVal()); |
| String taggedSent = tagger.getTaggedString(currSent.getStringVal()); |
| List<String> nouns = tagger.getWordsOfType(taggedSent, POSTagger.NOUN); |
| // For each noun |
| for(String noun : nouns) |
| { |
| int chainsAddCnt = 0; |
| // Loop through each LC |
| for(LexicalChain l: lc) |
| { |
          try {
            WordRelation rel = wordRel.getRelation(l, noun, (currSent.getSentId() - l.start) > 7);
            if (rel.relation == WordRelation.STRONG_RELATION)
            {
              // The noun is an exact match to a word in this chain (strong relation) -
              // add the sentence to the chain and update the last occurrence.
              addToChain(rel.dest, l, chains, currSent);
              // A dormant chain (no additions for over 10 sentences) restarts
              // as a new occurrence.
              if (currSent.getSentId() - l.last > 10)
              {
                l.occurences++;
                l.start = currSent.getSentId();
              }
              chainsAddCnt++;
            }
            else if (rel.relation == WordRelation.MED_RELATION)
            {
              // Medium-strength relation - add the sentence to the chain.
              addToChain(rel.dest, l, chains, currSent);
              chainsAddCnt++;
              // If the chain started more than 7 sentences back, still add the word,
              // but count it as a new occurrence of the lexical chain.
              if (currSent.getSentId() - l.start > 7)
              {
                l.occurences++;
                l.start = currSent.getSentId();
              }
            }
            else if (rel.relation == WordRelation.WEAK_RELATION)
            {
              // Weak relation (a 1-hop relation in WordNet) - add the sentence only
              // if the chain started at most 3 sentences back.
              if (currSent.getSentId() - l.start <= 3)
              {
                addToChain(rel.dest, l, chains, currSent);
                chainsAddCnt++;
              }
            }
          } catch (Exception ex) {
            // Skip a noun whose relation lookup fails rather than aborting the build.
            log.warning("Skipping noun '" + noun + "': " + ex.getMessage());
          }
          // End loop over lexical chains.
| } |
        // The noun fit no existing chain - start a new lexical chain for each of its word senses.
        if (chainsAddCnt == 0)
| { |
| List<Word> senses = wordRel.getWordSenses(noun); |
| for(Word w : senses) |
| { |
| LexicalChain newLc = new LexicalChain(); |
| newLc.start = currSent.getSentId(); |
| addToChain(w, newLc, chains, currSent); |
| lc.add(newLc); |
| } |
| } |
        // Keep the number of open chains manageable.
        if (lc.size() > 20)
          purge(lc, currSent.getSentId(), sent.size());
| } |
| //End sentence |
| } |
| |
    // TODO: disambiguateAndCleanChains(lc, chains);
    // Score calculation (chain length * homogeneity) and sorting by strength
    // are left to the caller (see summarize()).
| return lc; |
| } |
| |
| /* |
| * A way to manage the number of lexical chains generated. Expire very small lexical chains .. |
| * Takes care to only remove small chains that were added "long back" |
| */ |
  private void purge(List<LexicalChain> lc, int sentId, int totSents) {
    // Nothing to purge while there are fewer than 20 open chains.
    if (lc.size() < 20) return;

    Collections.sort(lc);
    double min = lc.get(0).score();

    int cutOff = Math.max(3, (int) min);
    Set<String> words = new HashSet<>();
    List<LexicalChain> toRem = new ArrayList<>();
    for (int i = lc.size() - 1; i >= 0; i--)
    {
      LexicalChain l = lc.get(i);
      // A weak chain that has not been extended in a long while - expire it.
      if (l.score() < cutOff && (sentId - l.last) > totSents / 3) // && containsAllWords(words, l.word)
        toRem.add(l);
      // A different sense of an already-covered word, added long back - expire it.
      else if (words.contains(l.getWord().get(0).getLexicon()) && (sentId - l.start) > totSents / 10)
        toRem.add(l);
      else
      {
        // The chain survives - remember the words it covers.
        for (Word w : l.word)
          words.add(w.getLexicon());
      }
    }
| |
    lc.removeAll(toRem);
| } |
| |
  // Returns true if every word of the chain is already covered by the given lexicon set.
  // Currently referenced only by the commented-out condition in purge().
  private boolean containsAllWords(Set<String> words, List<Word> chainWords) {
    for (Word w : chainWords)
      if (!words.contains(w.getLexicon())) return false;

    return true;
  }
| |
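  /*
   * Adds the word and the sentence to the chain, updates the chain's last-seen sentence id,
   * and indexes the chain under the word's lexicon for later lookups.
   */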
  private void addToChain(Word noun, LexicalChain l,
      Map<String, List<LexicalChain>> chains, Sentence sent) {

    l.addWord(noun);
    l.addSentence(sent);
    l.last = sent.getSentId();
    if (!chains.containsKey(noun.getLexicon()))
      chains.put(noun.getLexicon(), new ArrayList<>());
    chains.get(noun.getLexicon()).add(l);
  }
| |
| POSTagger getTagger() { |
| return tagger; |
| } |
| |
| void setTagger(POSTagger tagger) { |
| this.tagger = tagger; |
| } |
| |
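  /*
   * Forms the summary by walking the chains in strength order and pulling the first
   * not-yet-used sentence out of each chain until the word budget is reached.
   */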
  @Override
  public String summarize(String article, DocProcessor dp, int maxWords) {
    List<Sentence> sent = dp.getSentencesFromStr(article);
    List<LexicalChain> lc = buildLexicalChains(article, sent);
    // Sort chains by strength (LexicalChain's natural ordering).
    Collections.sort(lc);
    int summSize = 0;
    List<Sentence> summ = new ArrayList<>();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < lc.size(); i++)
    {
      List<Sentence> chainSents = lc.get(i).sentences;
      // Pick the first sentence of this chain that is not already in the summary.
      for (int j = 0; j < chainSents.size(); j++)
      {
        Sentence candidate = chainSents.get(j);
        if (!summ.contains(candidate))
        {
          summ.add(candidate);
          sb.append(candidate.getStringVal()).append(' ');
          summSize += candidate.getWordCnt();
          break;
        }
      }
      if (summSize >= maxWords) break;
    }
    return sb.toString().trim();
  }
| |
| } |
| |
| |