/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.summarization.textrank;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;

import opennlp.summarization.DocProcessor;
import opennlp.summarization.Score;
import opennlp.summarization.preprocess.IDFWordWeight;
import opennlp.summarization.preprocess.StopWords;
import opennlp.summarization.preprocess.WordWeight;

/**
* Implements the TextRank algorithm by Mihalcea et al.
* <p>
* TextRank applies the PageRank algorithm to a graph in which each sentence is a node
* and an edge connects two sentences that share a word. It returns the sentences ranked
* by score, where a higher rank indicates a more important sentence.
* Currently, words are only stemmed before comparison; a more sophisticated approach
* might use a resource like WordNet to also match synonyms.
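* <p>
* A minimal usage sketch (the {@code DocProcessor} implementation, the input text
* {@code doc}, the sentence lists, and the inverted index {@code iidx} are assumed
* to be prepared by the caller):
* <pre>{@code
* TextRank textRank = new TextRank(docProcessor); // assumes ./resources/idf.csv exists
* List<Score> ranked = textRank.getRankedSentences(doc, sentences, iidx, processedSent);
* // Scores come back sorted via Score's natural ordering; take the top entry.
* int bestSentId = ranked.get(0).getSentId();
* String bestSentence = sentences.get(bestSentId);
* }</pre>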
*/
public class TextRank {
private final StopWords sw;
private String article;
private Hashtable<Integer, List<Integer>> links;
private List<String> sentences = new ArrayList<>();
private List<String> processedSent = new ArrayList<>();
private final WordWeight wordWt;
private DocProcessor docProc;

// Maximum number of PageRank iterations.
private static final int NO_OF_IT = 100;
// Convergence threshold: stop iterating once the average score change per
// sentence drops below this value.
private final double maxErr = 0.1;
// Weight reserved for the title when initializing sentence scores.
private final double title_wt = 0;
// Damping factor (d) in the PageRank update.
private static final double DF = 0.15;
// If true, words from the first (title) sentence get the boosted weight below.
private static final boolean HIGHER_TITLE_WEIGHT = true;
private static final double TITLE_WRD_WT = 2d;

public TextRank(DocProcessor dp) {
    sw = new StopWords();
    setLinks(new Hashtable<>());
    processedSent = new ArrayList<>();
    docProc = dp;
    String resources = "./resources";
    wordWt = IDFWordWeight.getInstance(resources + "/idf.csv");
}

// Note: this constructor leaves docProc unset; getNeighborsSigmaWtSim(),
// which stems words, requires a DocProcessor.
public TextRank(StopWords sw, WordWeight wordWts) {
    this.sw = sw;
    this.wordWt = wordWts;
    // Initialize the link table; without this, addLink() would throw a
    // NullPointerException when this constructor is used.
    setLinks(new Hashtable<>());
}

// Returns the similarity of two sentences. wrdWts maps each word to its
// tf-idf weight.
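// Worked example (illustrative weights, assuming weight("cat") = 2.0 and "the"
// is a stop word): for "the cat sat" vs "the cat ran", the only shared
// non-stop word is "cat", so the result is 2.0 / (3 + 3) ≈ 0.33.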
public double getWeightedSimilarity(String sent1, String sent2,
        Hashtable<String, Double> wrdWts) {
    String[] words1 = sent1.trim().split("\\s+");
    String[] words2 = sent2.trim().split("\\s+");
    double wordsInCommon = 0;
    Hashtable<String, Boolean> dups = new Hashtable<>();
    for (String s : words1) {
        String currWrd1 = s.trim();
        // Skip words of the first sentence that were already processed.
        if (dups.get(currWrd1) == null) {
            dups.put(currWrd1, true);
            for (String currWrd2 : words2) {
                if (!sw.isStopWord(currWrd1) && !currWrd1.isEmpty()
                        && currWrd1.equals(currWrd2)) {
                    // Weight the match by the word's tf-idf weight if known,
                    // otherwise count it as 1.
                    Double wt = wrdWts.get(currWrd1);
                    if (wt != null) {
                        wordsInCommon += wt;
                    } else {
                        wordsInCommon++;
                    }
                }
            }
        }
    }
    // Normalize by the combined length of the two sentences.
    return wordsInCommon / (words1.length + words2.length);
}

// Returns the score for the given sentence id, or 1 if the id is not found.
public double getScoreFrom(List<Score> scores, int id) {
    for (Score s : scores) {
        if (s.getSentId() == id) {
            return s.getScore();
        }
    }
    return 1;
}

// Runs the PageRank iteration over the sentence graph:
// TR(Vi) = (1 - d) + d * sum over neighbors Vj of
//          ( w(i,j) / sum over neighbors Vk of Vj of w(j,k) ) * TR(Vj)
// where w(i,j) is the weighted similarity between sentences i and j.
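// Example with d = 0.15: a sentence with a single neighbor j, where
// w(i,j) = 0.5, the neighbor's raw score sum is 1.0, and TR(Vj) = 1.0,
// is updated to TR(Vi) = 0.85 + 0.15 * (0.5 / 1.0) * 1.0 = 0.925.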
public List<Score> getTextRankScore(List<Score> rawScores,
        List<String> sentences, Hashtable<String, Double> wrdWts) {
    List<Score> currWtScores = new ArrayList<>();
    // Start with equal scores for all sentences.
    for (Score rawScore : rawScores) {
        Score ns = new Score();
        ns.setSentId(rawScore.getSentId());
        ns.setScore((1 - title_wt) / rawScores.size());
        currWtScores.add(ns);
    }
    // PageRank iterations..
    for (int i = 0; i < NO_OF_IT; i++) {
        double totErr = 0;
        List<Score> newWtScores = new ArrayList<>();
        // Update the score of every sentence for the current iteration.
        for (Score rs : rawScores) {
            int sentId = rs.getSentId();
            Score ns = new Score();
            ns.setSentId(sentId);
            List<Integer> neighbors = getLinks().get(sentId);
            double sum = 0;
            if (neighbors != null) {
                for (Integer j : neighbors) {
                    double wij = getWeightedSimilarity(sentences.get(sentId),
                            sentences.get(j), wrdWts);
                    double sigmawjk = getScoreFrom(rawScores, j);
                    double txtRnkj = getScoreFrom(currWtScores, j);
                    sum += wij / sigmawjk * txtRnkj;
                }
            }
            ns.setScore((1d - DF) + sum * DF);
            // Track how much the score changed since the previous iteration.
            totErr += Math.abs(ns.getScore() - getScoreFrom(currWtScores, sentId));
            newWtScores.add(ns);
        }
        currWtScores = newWtScores;
        // Stop early once the average change per sentence is small enough.
        if (i > 2 && totErr / rawScores.size() < maxErr) {
            break;
        }
    }
    // Scale the converged TextRank score by the raw similarity score.
    for (Score s : currWtScores) {
        s.setScore(s.getScore() * getScoreFrom(rawScores, s.getSentId()));
    }
    return currWtScores;
}

// The raw score of a sentence is the sum of its weighted similarities to all
// neighboring sentences (sentences sharing a word with it). It is used in the
// denominator of the TextRank formula.
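// For example, if sentences 0 and 3 both contain the stemmed word "market",
// iidx maps "market" -> [0, 3]; while scoring sentence 0, w(0,3) is added to
// its raw score and a link 0 -> 3 is recorded.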
public List<Score> getNeighborsSigmaWtSim(List<String> sentences,
        Hashtable<String, List<Integer>> iidx, Hashtable<String, Double> wts) {
    List<Score> allScores = new ArrayList<>();
    for (int i = 0; i < sentences.size(); i++) {
        String nextSent = sentences.get(i);
        String[] words = nextSent.trim().split("\\s+");
        Score s = new Score();
        s.setSentId(i);
        // Track neighbors already scored so a sentence sharing several words
        // with this one is counted and linked only once.
        List<Integer> processed = new ArrayList<>();
        for (String word : words) {
            String currWrd = docProc.getStemmer().stem(word).toString();
            // Look up all other sentences containing this stemmed word.
            List<Integer> otherSents = iidx.get(currWrd);
            if (otherSents == null) {
                continue;
            }
            for (int idx : otherSents) {
                if (idx != i && !processed.contains(idx)) {
                    double currS = getWeightedSimilarity(sentences.get(i),
                            sentences.get(idx), wts);
                    s.setScore(s.getScore() + currS);
                    if (currS > 0) {
                        addLink(i, idx);
                    }
                    processed.add(idx);
                }
            }
        }
        allScores.add(s);
    }
    return allScores;
}

// Computes the TextRank scores and returns them sorted by Score's natural ordering.
public List<Score> getWeightedScores(List<Score> rawScores,
        List<String> sentences, Hashtable<String, Double> wordWts) {
    List<Score> weightedScores = getTextRankScore(rawScores, sentences, wordWts);
    Collections.sort(weightedScores);
    return weightedScores;
}

// Builds a word -> weight table for every word in the inverted index.
private Hashtable<String, Double> toWordWtHashtable(WordWeight wwt,
        Hashtable<String, List<Integer>> iidx) {
    Hashtable<String, Double> wrdWt = new Hashtable<>();
    Enumeration<String> keys = iidx.keys();
    while (keys.hasMoreElements()) {
        String key = keys.nextElement();
        wrdWt.put(key, wwt.getWordWeight(key));
    }
    return wrdWt;
}

// Main entry point: ranks the given sentences of a document. iidx is an
// inverted index mapping each (stemmed) word to the ids of the sentences
// containing it.
public List<Score> getRankedSentences(String doc, List<String> sentences,
        Hashtable<String, List<Integer>> iidx, List<String> processedSent) {
    this.sentences = sentences;
    this.processedSent = processedSent;
    Hashtable<String, Double> wrdWts = toWordWtHashtable(this.wordWt, iidx);
    // Optionally boost words that occur in the title (the first sentence).
    if (HIGHER_TITLE_WEIGHT && !getSentences().isEmpty()) {
        String sent = getSentences().get(0);
        String[] wrds = sent.trim().split("\\s+");
        for (String wrd : wrds) {
            wrdWts.put(wrd, TITLE_WRD_WT);
        }
    }
    List<Score> rawScores = getNeighborsSigmaWtSim(getSentences(), iidx, wrdWts);
    return getWeightedScores(rawScores, getSentences(), wrdWts);
}

// Records a directed link from sentence i to sentence idx.
private void addLink(int i, int idx) {
    List<Integer> endNodes = getLinks().get(i);
    if (endNodes == null) {
        endNodes = new ArrayList<>();
    }
    endNodes.add(idx);
    getLinks().put(i, endNodes);
}

public void setSentences(List<String> sentences) {
    this.sentences = sentences;
}

public List<String> getSentences() {
    return sentences;
}

public void setArticle(String article) {
    this.article = article;
}

public String getArticle() {
    return article;
}

private void setLinks(Hashtable<Integer, List<Integer>> links) {
    this.links = links;
}

public Hashtable<Integer, List<Integer>> getLinks() {
    return links;
}
}