OPENNLP-752 Added the summarizer contribution. Thanks to Ram Soma for contributing it.
diff --git a/summarizer/src/main/java/opennlp/summarization/DocProcessor.java b/summarizer/src/main/java/opennlp/summarization/DocProcessor.java
new file mode 100644
index 0000000..c65fba5
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/DocProcessor.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization;
+
+import java.util.List;
+
+import opennlp.tools.stemmer.Stemmer;
+
+/*
+ * A document processor abstracts a lot of the underlying complexities of parsing the document and
+ * preparing it (e.g. stemming, stop word removal) from the summarization algorithm. The current package
+ * supports sentence extraction based algorithms. Thus extracting Sentences from the text is the
+ * first step and the basis for the algorithms.
+ */
+public interface DocProcessor {
+ /* Extract sentences from a string representing an article.*/
+ public List<Sentence> getSentencesFromStr(String text) ;
+ /* Utility method to parse out words from a string.*/
+ public String[] getWords(String sent);
+ /* Provide a stemmer to stem words*/
+ public Stemmer getStemmer();
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/Score.java b/summarizer/src/main/java/opennlp/summarization/Score.java
new file mode 100755
index 0000000..7cbd067
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/Score.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization;
+
+/*
+ * Utility class to store the score of a sentence for ranking sentences within a document.
+ */
+public class Score implements Comparable<Score>
+{
+ int sentId;
+ public double score;
+
+ public Score()
+ {
+ score = 0;
+ }
+
+ public int getSentId(){
+ return sentId;
+ }
+
+ public double getScore()
+ {
+ return score;
+ }
+
+ public void setScore(double score)
+ {
+ this.score = score;
+ }
+
+ public void setSentId(int sentId)
+ {
+ this.sentId = sentId;
+ }
+
+ public int compareTo(Score o)
+ {
+
+ if(o.score > score) return 1;
+ else if (o.score < score) return -1;
+ return 0;
+ }
+
+ public String toString()
+ {
+ return sentId +" "+score;
+ }
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/Sentence.java b/summarizer/src/main/java/opennlp/summarization/Sentence.java
new file mode 100755
index 0000000..6ad1d22
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/Sentence.java
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization;
+
+import java.text.BreakIterator;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Locale;
+
+import opennlp.summarization.preprocess.PorterStemmer;
+import opennlp.summarization.preprocess.StopWords;
+
+/*
+ * A representation of a sentence geared toward pagerank and summarization.
+ */
+public class Sentence {
+ //sentId is always position of sentence in doc..
+ private int sentId;
+ private String stringVal, procStringVal;
+ private Score pageRankScore;
+ private int paragraph;
+ private int paraPos;
+ private boolean hasQuote;
+ private double wordWt = 0;
+ private int wordCnt;
+
+ private List<Sentence> links;
+ private PorterStemmer stemmer;
+
+ public Sentence(){
+ links = new ArrayList<Sentence>();
+ }
+
+ public Sentence(int id){
+ this();
+ this.sentId = id;
+ }
+
+ public void setSentId(int sentId) {
+ this.sentId = sentId;
+ }
+
+ public int getSentId() {
+ return sentId;
+ }
+
+ public void setPageRankScore(Score pageRankScore) {
+ this.pageRankScore = pageRankScore;
+ }
+
+ public Score getPageRankScore() {
+ return pageRankScore;
+ }
+
+ public void setParagraph(int paragraph) {
+ this.paragraph = paragraph;
+ }
+
+ public int getParagraph() {
+ return paragraph;
+ }
+
+ public void setParaPos(int paraPos) {
+ this.paraPos = paraPos;
+ }
+
+ public int getParaPos() {
+ return paraPos;
+ }
+
+ public void setStringVal(String stringVal) {
+ this.stringVal = stringVal;
+ if(stringVal.contains("\"")) this.hasQuote = true;
+ this.wordCnt = calcWrdCnt(stringVal);
+ }
+
+ private int calcWrdCnt(String stringVal2) {
+ int ret = 0;
+ StopWords sw = StopWords.getInstance();
+ String[] wrds = stringVal.split(" ");
+ for(String wrd: wrds){
+ if(!sw.isStopWord(wrd)&&!wrd.startsWith("'")&&!wrd.equals(".")&&!wrd.equals("?"))
+ ret++;
+ }
+ return ret;
+ }
+
+ public String getStringVal() {
+ return stringVal;
+ }
+
+ public void addLink(Sentence s)
+ {
+ this.links.add(s);
+ }
+
+ public List<Sentence> getLinks()
+ {
+ return this.links;
+ }
+
+ public String toString()
+ {
+ return this.stringVal ;//+ "("+ this.paragraph +", "+this.paraPos+")";
+ }
+
+ public void setWordWt(double wordWt) {
+ this.wordWt = wordWt;
+ }
+
+ public double getWordWt() {
+ return wordWt;
+ }
+
+ public int getWordCnt()
+ {
+ return wordCnt==0? this.getStringVal().split(" ").length: wordCnt;
+ }
+
+ //Should add an article id to the sentence class.. For now returns true if the ids are the same..
+ public boolean equals(Object o){
+ if(! (o instanceof Sentence)) return false;
+
+ Sentence s = (Sentence)o;
+ if(s.sentId == this.sentId) return true;
+ return false;
+ }
+
+ static final String space=" ";
+ public String stem() {
+ PorterStemmer stemmer = new PorterStemmer();
+ StopWords sw = StopWords.getInstance();
+
+ BreakIterator wrdItr = BreakIterator.getWordInstance(Locale.US);
+ int wrdStrt = 0;
+ StringBuffer b = new StringBuffer();
+ wrdItr.setText(stringVal);
+ for(int wrdEnd = wrdItr.next(); wrdEnd != BreakIterator.DONE;
+ wrdStrt = wrdEnd, wrdEnd = wrdItr.next())
+ {
+ String word = this.getStringVal().substring(wrdStrt, wrdEnd);//words[i].trim();
+ word.replaceAll("\"|'","");
+
+ //Skip stop words and stem the word..
+ if(sw.isStopWord(word)) continue;
+ stemmer.stem(word);
+ b.append(stemmer.toString());
+ b.append(space);
+ }
+ // TODO Auto-generated method stub
+ return b.toString();
+ }
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/Summarizer.java b/summarizer/src/main/java/opennlp/summarization/Summarizer.java
new file mode 100644
index 0000000..3b9d006
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/Summarizer.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization;
+
+public interface Summarizer {
+ public String summarize(String article, DocProcessor dp, int maxWords);
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java
new file mode 100644
index 0000000..f8302e5
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/*
+ * Use the lexical chaining algorithm to extract keywords.
+ */
+public class LexChainingKeywordExtractor {
+
+ //Simple logic to pull out the keyword based on longest lexical chains..
+ public List<String> getKeywords(List<LexicalChain> lexicalChains, int noOfKeywrds){
+ Collections.sort(lexicalChains);
+ List<String> ret = new ArrayList<String>();
+ for(int i=0;i<Math.min(lexicalChains.size(), noOfKeywrds);i++)
+ {
+ List<Word> words = lexicalChains.get(i).getWord();
+ if(words.size()>0 &&!ret.contains(words.get(0).getLexicon())){
+ ret.add(words.get(0).getLexicon());
+ }
+ }
+ return ret;
+ }
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java
new file mode 100644
index 0000000..6e92bd5
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.summarization.Sentence;
+
+
+public class LexicalChain implements Comparable<LexicalChain>{
+ List<Word> word;
+
+ List<Sentence> sentences;
+
+ int start, last;
+ int score;
+ int occurences=1;
+
+ public LexicalChain()
+ {
+ word = new ArrayList<Word>();
+ sentences = new ArrayList<Sentence>();
+ }
+
+ public double score()
+ {
+ return length() ;//* homogeneity();
+ }
+
+ public int length(){
+ return word.size();
+ }
+
+ public float homogeneity()
+ {
+ return (1.0f - (float)occurences/(float)length());
+ }
+
+ public void addWord(Word w)
+ {
+ word.add(w);
+ }
+
+ public void addSentence(Sentence sent)
+ {
+ if(!sentences.contains(sent))
+ sentences.add(sent);
+ }
+
+ public List<Word> getWord()
+ {
+ return word;
+ }
+
+ public List<Sentence>getSentences()
+ {
+ return this.sentences;
+ }
+
+ @Override
+ public int compareTo(LexicalChain o) {
+ double diff = (score() - o.score());
+ return diff ==0? 0: diff > 0 ?1:-1;
+ }
+
+ @Override
+ public boolean equals(Object o){
+ return super.equals(o);
+ }
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java
new file mode 100755
index 0000000..218f8f5
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+import java.util.*;
+import java.util.logging.Logger;
+
+import opennlp.summarization.DocProcessor;
+import opennlp.summarization.Score;
+import opennlp.summarization.Sentence;
+import opennlp.summarization.Summarizer;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+
/*
 * Implements the algorithm outlined in "Summarization Using Lexical Chains" by R. Barzilay et al.
 * The algorithm is based on extracting so-called lexical chains - sets of sentences in the article
 * that share a word or very closely related words. The longest chain represents the most important
 * topic and so forth. A summary can then be formed by identifying the most important lexical chains
 * and "pulling" out sentences from them.
 */
+public class LexicalChainingSummarizer implements Summarizer{
+
+ private POSTagger tagger;
+ private DocProcessor dp;
+ private WordRelationshipDetermination wordRel;
+ private Logger log;
+ public LexicalChainingSummarizer(DocProcessor dp, String posModelFile) throws Exception
+ {
+ wordRel = new WordRelationshipDetermination();
+ tagger = new OpenNLPPOSTagger(dp, posModelFile);
+ log = Logger.getLogger("LexicalChainingSummarizer");
+ }
+
+ //Build Lexical chains..
+ public List<LexicalChain> buildLexicalChains(String article, List<Sentence> sent)
+ {
+ // POS tag article
+ Hashtable<String, List<LexicalChain>> chains = new Hashtable<String, List<LexicalChain>>();
+ List<LexicalChain> lc = new ArrayList<LexicalChain>();
+ // Build lexical chains
+ // For each sentence
+ for(Sentence currSent : sent)
+ {
+ log.info(currSent.getStringVal());
+ String taggedSent = tagger.getTaggedString(currSent.getStringVal());
+ List<String> nouns = tagger.getWordsOfType(taggedSent, POSTagger.NOUN);
+ // For each noun
+ for(String noun : nouns)
+ {
+ int chainsAddCnt = 0;
+ // Loop through each LC
+ for(LexicalChain l: lc)
+ {
+ try{
+ WordRelation rel = wordRel.getRelation(l, noun, (currSent.getSentId() - l.start)>7);
+ // Is the noun an exact match to one of the current LCs (Strong relation)
+ // Add sentence to chain
+ if(rel.relation == WordRelation.STRONG_RELATION)
+ {
+ addToChain(rel.dest, l, chains, currSent);
+ if(currSent.getSentId() - l.last > 10)
+ {
+ l.occurences++; l.start = currSent.getSentId();
+ }
+ chainsAddCnt++;
+ }
+ else if(rel.relation == WordRelation.MED_RELATION)
+ {
+ // Add sentence to chain if it is 7 sentences away from start of chain
+ addToChain(rel.dest, l, chains, currSent);
+ chainsAddCnt++;
+ //If greater than 7 we will add it but call it a new occurence of the lexical chain...
+ if(currSent.getSentId() - l.start > 7)
+ {
+ l.occurences++;
+ l.start = currSent.getSentId();
+ }
+ }
+ else if(rel.relation == WordRelation.WEAK_RELATION)
+ {
+ if(currSent.getSentId() - l.start <= 3)
+ {
+ addToChain(rel.dest, l, chains, currSent);
+ chainsAddCnt++;
+ }
+ }
+ }catch(Exception ex){}
+ // add sentence and update last occurence..
+ //chaincnt++
+ // else 1 hop-relation in Wordnet (weak relation)
+ // Add sentence to chain if it is 3 sentences away from start of chain
+ //chaincnt++
+ // End loop LC
+ }
+ //Could not add the word to any existing list.. Start a new lexical chain with the word..
+ if(chainsAddCnt==0)
+ {
+ List<Word> senses = wordRel.getWordSenses(noun);
+ for(Word w : senses)
+ {
+ LexicalChain newLc = new LexicalChain();
+ newLc.start = currSent.getSentId();
+ addToChain(w, newLc, chains, currSent);
+ lc.add(newLc);
+ }
+ }
+ if(lc.size()> 20)
+ purge(lc, currSent.getSentId(), sent.size());
+ }
+ //End sentence
+ }
+
+// diambiguateAndCleanChains(lc, chains);
+ // Calculate score
+ // Length of chain * homogeneity
+ //sort LC by strength..
+ return lc;
+ }
+
+ /*
+ * A way to manage the number of lexical chains generated. Expire very small lexical chains ..
+ * Takes care to only remove small chains that were added "long back"
+ */
+ private void purge(List<LexicalChain> lc, int sentId, int totSents) {
+ //Do nothing for the first 50 sentences..
+ if(lc.size()<20 ) return;
+
+ Collections.sort(lc);
+ double min = lc.get(0).score();
+ double max = lc.get(lc.size()-1).score();
+
+ int cutOff = Math.max(3, (int)min);
+ Hashtable<String, Boolean> words = new Hashtable<String, Boolean>();
+ List<LexicalChain> toRem = new ArrayList<LexicalChain>();
+ for(int i=lc.size()-1; i>=0;i--)
+ {
+ LexicalChain l = lc.get(i);
+ if(l.score() < cutOff && (sentId - l.last) > totSents/3)// && containsAllWords(words, l.word))
+ toRem.add(l);
+ //A different sense and added long back..
+ else if(words.containsKey(l.getWord().get(0).getLexicon()) && (sentId - l.start) > totSents/10)
+ toRem.add(l);
+ else
+ {
+ //Check if this is from a word with different sense..
+ for(Word w: l.word)
+ words.put(w.getLexicon(), new Boolean(true));
+ }
+ }
+
+ for(LexicalChain l: toRem)
+ lc.remove(l);
+ }
+
+ private boolean containsAllWords(Hashtable<Word, Boolean> words,
+ List<Word> word) {
+ boolean ret = true;
+ for(Word w: word)
+ if(!words.containsKey(word)) return false;
+
+ return ret;
+ }
+
+ private void addToChain(Word noun, LexicalChain l,
+ Hashtable<String, List<LexicalChain>> chains, Sentence sent) {
+
+ l.addWord(noun);
+ l.addSentence(sent);
+ l.last = sent.getSentId();
+ if(!chains.contains(noun))
+ chains.put(noun.getLexicon(), new ArrayList<LexicalChain>());
+ chains.get(noun.getLexicon()).add(l);
+ }
+
+ POSTagger getTagger() {
+ return tagger;
+ }
+
+ void setTagger(POSTagger tagger) {
+ this.tagger = tagger;
+ }
+
+ @Override
+ public String summarize(String article, DocProcessor dp, int maxWords) {
+ List<Sentence> sent = dp.getSentencesFromStr(article);
+ List<LexicalChain> lc = buildLexicalChains(article, sent);
+ Collections.sort(lc);
+ int summSize=0;
+ List<Sentence>summ = new ArrayList<Sentence>();
+ StringBuffer sb = new StringBuffer();
+ for(int i=0;i<lc.size();i++)
+ {
+ for(int j=0;j<lc.size();j++)
+ {
+ Sentence candidate = lc.get(i).sentences.get(j);
+ if(!summ.contains(candidate))
+ {
+ summ.add(candidate);
+ sb.append(candidate.getStringVal());
+ summSize += candidate.getWordCnt();
+ break;
+ }
+ }
+ if(summSize>=maxWords) break;
+ }
+ return sb.toString();
+ }
+
+}
+
+
\ No newline at end of file
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java
new file mode 100644
index 0000000..4fd602a
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.logging.Logger;
+
+import javax.annotation.processing.Processor;
+
+import opennlp.summarization.DocProcessor;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTaggerME;
+
+public class OpenNLPPOSTagger implements POSTagger{
+ private POSTaggerME tagger;
+ private Hashtable<Integer, String[]> tagMap;
+ private DocProcessor dp;
+ private Logger log;
+
+ public OpenNLPPOSTagger(DocProcessor dp, String posModelFileName) throws Exception{
+ log = Logger.getLogger("OpenNLPPOSTagger");
+ InputStream modelIn = null;
+ this.dp = dp;
+ initTagMap();
+ try {
+ modelIn = new FileInputStream(posModelFileName);
+ POSModel model = new POSModel(modelIn);
+ tagger = new POSTaggerME(model);
+ }
+ catch (IOException e) {
+ // Model loading failed, handle the error
+ e.printStackTrace();
+ throw e;
+ }
+ finally {
+ if (modelIn != null) {
+ try {
+ modelIn.close();
+ }
+ catch (IOException e) {
+ }
+ }
+ }
+ }
+
+ private String[] nounTags = {"NN", "NNS","NNP","NNPS"};
+ private void initTagMap()
+ {
+ tagMap = new Hashtable<Integer, String[]>();
+ tagMap.put(POSTagger.NOUN, nounTags);
+ }
+
+ //Returns true if the typestring belongs to one of the tags for the type..
+ public boolean isType(String typeStr, int type)
+ {
+ boolean ret = false;
+ String[] tags = tagMap.get(type);
+ for(String tag: tags)
+ if(typeStr.equalsIgnoreCase(tag)) ret = true;
+
+ return ret;
+ }
+
+ @Override
+ public String getTaggedString(String article) {
+ return tagger.tag(article);
+ }
+
+ @Override
+ public List<String> getWordsOfType(String sent, int type)
+ {
+ List<String> ret = new ArrayList<String>();
+ String[] tokens = dp.getWords(sent);
+ for(String t:tokens)
+ {
+ String[] wordPlusType = t.split("/");
+ if(wordPlusType.length ==2)
+ {
+ if(isType(wordPlusType[1], type))
+ ret.add(wordPlusType[0]);
+ }
+ }
+ log.info(ret.toString());
+ return ret;
+ }
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java
new file mode 100644
index 0000000..850222c
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+import java.util.List;
+
/**
 * Minimal part-of-speech tagging abstraction used by the lexical chaining summarizer.
 */
public interface POSTagger {
  // Word-type constants understood by getWordsOfType(). Interface fields are
  // implicitly public static final.
  int NOUN = 0;
  int VERB = 1;
  int ADJECTIVE = 2;
  int ADVERB = 3;
  int PRONOUN = 4;

  /** Returns the article text with POS tags attached to each token. */
  String getTaggedString(String article);

  /** Returns the words of the given type found in the (already tagged) sentence. */
  List<String> getWordsOfType(String sent, int type);
}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/Word.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/Word.java
new file mode 100644
index 0000000..755adb0
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/Word.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
/**
 * A word together with its sense and identifier, as used in lexical chains.
 */
public interface Word {
  // The lexicon entry (surface form) of the word.
  String getLexicon();

  void setLexicon(String lex);

  // The sense of the word (implementation-specific, e.g. a WordNet sense id).
  Object getSense();

  void setSense(Object senseID);

  // An identifier for the word (implementation-specific).
  Object getID();

  void setID(Object id);
}
\ No newline at end of file
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelation.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelation.java
new file mode 100644
index 0000000..2b96d9c
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelation.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+public class WordRelation {
+ //Match strength constants for lexical chains..
+ public static int STRONG_RELATION = 0;
+ public static int MED_RELATION = 1;
+ public static int WEAK_RELATION = 2;
+ public static int NO_RELATION = 3;
+
+ public Word src;
+ public Word dest;
+ public int relation;
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
new file mode 100644
index 0000000..bcb6522
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.List;
+
+import edu.mit.jwi.data.ILoadPolicy;
+import edu.mit.jwi.item.IIndexWord;
+import edu.mit.jwi.item.ISynset;
+import edu.mit.jwi.item.ISynsetID;
+import edu.mit.jwi.item.IWord;
+import edu.mit.jwi.item.IWordID;
+import edu.mit.jwi.item.POS;
+import edu.mit.jwi.item.Pointer;
+import edu.mit.jwi.Dictionary;
+import edu.mit.jwi.IDictionary;
+import edu.mit.jwi.RAMDictionary;
+
+/*
+ * Uses wordnet to determine the relation of two words.
+ * Words have -
+ * strong relationship: same word
+ * Med relationship: synonym, hyponym
+ * weak relationship: antonym, hypernym..
+ * No relationship: otherwise
+ */
+public class WordRelationshipDetermination {
+
+ IDictionary dictionary;
+ String dictionaryFile="resources/wordnet/dict";
+ int MAX_DIST_MED_REL = 1000;
+
+ public WordRelationshipDetermination() throws Exception
+ {
+ dictionary = new RAMDictionary(new File(dictionaryFile), ILoadPolicy.IMMEDIATE_LOAD);
+ ((RAMDictionary)dictionary).load();
+ openDict();
+ }
+
+ private IWord isSynonynm(String noun, Word w)
+ {
+ WordnetWord ww = (WordnetWord)w;
+ IWord ret = null;
+ IIndexWord idxNoun = dictionary.getIndexWord(noun, POS.NOUN);
+
+ /*getWordIDs() returns all the WordID associated with a index
+ *
+ */
+// for(IWordID wordID : idxWord.getWordIDs())
+ {
+ //Construct an IWord object representing word associated with wordID
+// IWord word = dictionary.getWord(wordID);
+
+ //Get the synset in which word is present.
+ ISynset wordSynset = null;
+ if(ww.synonyms!=null)
+ wordSynset = ww.synonyms;
+ else{
+ IWord word = dictionary.getWord((IWordID)w.getID());
+ wordSynset = word.getSynset();
+ ww.synonyms = wordSynset;
+ }
+ IWord syn = inSynset(wordSynset, idxNoun);
+ if(w!=null){
+ ret = syn;
+// break;
+ }
+ }
+ return ret;
+ }
+ /*
+ * Returns true if the word represented by idxNoun is present in a synset..
+ */
+ Hashtable<ISynset, List<IWord>> synsetWordCache = new Hashtable<ISynset, List<IWord>>();
+ private IWord inSynset(ISynset wordSynset, IIndexWord idxNoun)
+ {
+ IWord ret = null;
+ List<IWord> wrds = null;
+
+ // if(synsetWordCache.get(wordSynset)!=null)
+// wrds = synsetWordCache.get(wordSynset);
+// else{
+ wrds = wordSynset.getWords();
+// synsetWordCache.put(wordSynset, wrds);
+// }
+
+ //Returns all the words present in the synset wordSynset
+ for(IWord synonym : wrds)
+ {
+ for(IWordID nounID : idxNoun.getWordIDs())
+ {
+ if(synonym.equals(dictionary.getWord(nounID)))
+ {
+ ret = synonym;
+ break;
+ }
+ }
+ }
+ return ret;
+ }
+
+ Pointer[] rels = {Pointer.ANTONYM, Pointer.HYPERNYM, Pointer.HYPONYM, Pointer.MERONYM_PART,
+ Pointer.MERONYM_SUBSTANCE, Pointer.PARTICIPLE, Pointer.HYPERNYM_INSTANCE};
+ Hashtable<ISynsetID, ISynset> cache = new Hashtable<ISynsetID, ISynset>();
+ //Returns a word if w has a medium strength relationship with noun. Returns null otherwise.
+ private Word isMediumRel(String noun, Word w)
+ {
+ // openDict();
+ WordnetWord ret = null;
+ WordnetWord ww = (WordnetWord) w;
+ IWord syn = null;
+ if((syn = this.isSynonynm(noun, w))!=null) {
+ ret = new WordnetWord();
+ ret.lexicon = noun;
+ ret.id = syn.getID();
+ ret.wordSense = syn .getSenseKey();
+ }
+
+ //Construct an IWord object representing word associated with wordID
+ IWord word = dictionary.getWord((IWordID)w.getID());
+
+ IIndexWord idxNoun = dictionary.getIndexWord(noun, POS.NOUN);
+ //Get the synset in which word is present.
+ ISynset wordSynset = word.getSynset();
+
+ for(Pointer p : rels)
+ {
+
+ List<ISynsetID> rels = null;
+ if(ww.rels.get(p)!=null)
+ rels = ww.rels.get(p);
+ else{
+ rels = wordSynset.getRelatedSynsets(p);
+ ww.rels.put(p, rels);
+ }
+
+ for(ISynsetID id: rels)
+ {
+ ISynset s = this.dictionary.getSynset(id);
+ IWord mat = inSynset(s, idxNoun);
+ if(mat!=null)
+ {
+ ret = new WordnetWord();
+ ret.lexicon = noun;
+ ret.id = mat.getID();
+ ret.wordSense = mat.getSenseKey();
+ break;
+ }
+ }
+ if(ret!=null) break;
+ }
+
+ return ret;
+ }
+
+ /*
+ * Returns the type of relation between a lexical chain and the noun. The return value is one of STRONG_RELATION, MEDIUM, WEAK, or NO
+ * Strong relation means exact match. Medium relation means synonym or hyponym
+ */
+ public WordRelation getRelation(LexicalChain l, String noun, boolean checkMed) throws Exception{
+ WordRelation ret = new WordRelation();
+ ret.relation = ret.NO_RELATION;
+ for(Word w : l.word)
+ {
+ //Exact match is a string relation..
+ if(w.getLexicon().equalsIgnoreCase(noun))
+ {
+ ret.relation = WordRelation.STRONG_RELATION;
+ ret.src = w;
+ ret.dest = w;
+ break;
+ }
+ // else it is a Wordnet word and is it a synonym or hyponym of LCs (medium relation)
+ else if(w.getID()!=null && checkMed){
+ Word wrel = isMediumRel(noun, w) ;
+ if(wrel!=null)
+ {
+ ret.relation = WordRelation.MED_RELATION;
+ ret.src = w;
+ ret.dest = wrel;
+ break;
+ }
+ }
+ }
+ return ret;
+ }
+
+ private void openDict()
+ {
+ if(!dictionary.isOpen())
+ try {
+ dictionary.open();
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ public List<Word> getWordSenses(String noun)
+ {
+ List<Word> ret = new ArrayList<Word>();
+ try{
+ // openDict();
+ List<IWordID> wordIDs = this.dictionary.getIndexWord(noun, POS.NOUN).getWordIDs();
+ for(IWordID wid: wordIDs)
+ {
+ Word w = new WordnetWord();
+ w.setLexicon(noun);
+ w.setID(wid);
+ ret.add(w);
+ }
+ }catch(Exception ex){
+ // ex.printStackTrace();
+ //Not in dictionary
+ Word w = new WordnetWord();
+ w.setLexicon(noun);
+ ret.add(w);
+ }
+ return ret;
+ }
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java
new file mode 100644
index 0000000..4d72f9a
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+package opennlp.summarization.lexicalchaining;
+
+import java.util.Hashtable;
+import java.util.List;
+
+import edu.mit.jwi.item.IPointer;
+import edu.mit.jwi.item.ISenseKey;
+import edu.mit.jwi.item.ISynset;
+import edu.mit.jwi.item.ISynsetID;
+import edu.mit.jwi.item.IWordID;
+
+public class WordnetWord implements Word{
+ String lexicon;
+ ISenseKey wordSense;
+ IWordID id;
+
+ //Cache..
+ ISynset synonyms;
+ Hashtable<IPointer, List<ISynsetID>>rels;
+
+ public WordnetWord()
+ {
+ rels = new Hashtable<IPointer, List<ISynsetID>>();
+ }
+
+ @Override
+ public String getLexicon() {
+ return lexicon;
+ }
+
+ @Override
+ public Object getSense() {
+ return wordSense;
+ }
+
+ @Override
+ public Object getID() {
+ return id;
+ }
+
+ @Override
+ public void setLexicon(String lex) {
+ this.lexicon = lex;
+ }
+
+ @Override
+ public void setSense(Object senseID) {
+ this.wordSense = (ISenseKey) senseID;
+ }
+
+ @Override
+ public void setID(Object id) {
+ this.id = (IWordID)id;
+ }
+
+ @Override
+ public String toString()
+ {
+ return this.lexicon;
+ }
+
+ @Override
+ public int hashCode()
+ {
+ return toString().hashCode();
+ }
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java b/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java
new file mode 100644
index 0000000..132416b
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.meta;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Hashtable;
+import java.util.List;
+
+import opennlp.summarization.Score;
+import opennlp.summarization.Sentence;
+import opennlp.summarization.lexicalchaining.LexicalChain;
+import opennlp.summarization.lexicalchaining.LexicalChainingSummarizer;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+import opennlp.summarization.textrank.TextRankSummarizer;
+
+import java.util.logging.*;
+
+import opennlp.summarization.DocProcessor;
+/*
+ * A summarizer that combines results from the text rank algorithm and the lexical chaining algorithm.
+ * It runs both algorithm and uses the the lexical chains to identify the main topics and relative importance
+ * and the text rank to pick sentences from lexical chains.
+ */
+public class MetaSummarizer{
+ DocProcessor dp ;
+ TextRankSummarizer textRank;
+ LexicalChainingSummarizer lcs;
+ String sentFragModel = "resources/en-sent.bin";
+
+ public MetaSummarizer(String posModelFile) throws Exception
+ {
+ Logger.getAnonymousLogger().info("Initializing Meta Summarizer");
+ dp = new DefaultDocProcessor(sentFragModel);
+ textRank = new TextRankSummarizer();
+ lcs = new LexicalChainingSummarizer(dp, posModelFile);
+ }
+
+ //An Utility method to sort the ranked sentences by sentence order.
+ private List<Score> order(List<Score> s)
+ {
+ Collections.sort(s, new Comparator<Score>()
+ {
+
+ @Override
+ public int compare(Score o1, Score o2) {
+ // TODO Auto-generated method stub
+
+ return o1.getSentId() - o2.getSentId();
+ }
+ });
+ return s;
+ }
+
+ // Rank sentences by merging the scores from lexical chaining and text rank..
+ // maxWords -1 indicates rank all sentences..
+ public int getBestSent(LexicalChain l, Hashtable<Integer, Score> pageRankScores)
+ {
+ double bestScore = 0; int bestStr=-1;
+ for(Sentence s : l.getSentences())
+ {
+ Score sc = pageRankScores.get(new Integer(s.getSentId()));
+ if(sc!=null && sc.getScore() > bestScore)
+ {
+ bestScore = sc.getScore();
+ bestStr = sc.getSentId();
+ }
+ }
+ return bestStr;
+ }
+
+ public List<Score> rankSentences(String article, List<Sentence> sent, int maxWords)
+ {
+ List<LexicalChain> lc = lcs.buildLexicalChains(article, sent);
+ Collections.sort(lc);
+ Hashtable<Integer, Score> sentScores = new Hashtable<Integer, Score>();
+ try{
+ List<Score> scores = textRank.rankSentences(article, sent, dp, article.length());
+ for(Score s: scores) sentScores.put(s.getSentId(), s);
+ }catch(Exception ex){
+ ex.printStackTrace();
+ }
+
+ Hashtable<Sentence, Boolean> summSents = new Hashtable<Sentence,Boolean>();
+ List<Score> finalSc = new ArrayList<Score>();
+ int currWordCnt = 0;
+ for(int i=lc.size()-1;i>=0;i--)
+ {
+ LexicalChain l = lc.get(i);
+ boolean added =false;
+ while(l.getSentences().size()>0)
+ {
+ int sentId = getBestSent(l, sentScores);
+ if(sentId == -1) break;
+
+ Sentence s = sent.get(sentId);
+
+ //Sentence already added, try again..
+ if(summSents.containsKey(s))
+ l.getSentences().remove(s);
+ else{
+ finalSc.add(sentScores.get(s.getSentId()));
+ summSents.put(s, true);
+ currWordCnt += s.getWordCnt();
+ break;
+ }
+ }
+ if(maxWords>0 && currWordCnt>maxWords) break;
+ }
+
+ order(finalSc);
+ return finalSc;
+ }
+
+ //Default Summarization using only lexical chains..
+ public String summarize(String article, int maxWords)
+ {
+ //Build lexical Chains..
+ List<Sentence> sent = dp.getSentencesFromStr(article);
+
+ List<Score>finalSc = rankSentences(article, sent, maxWords);
+
+ StringBuilder sb = new StringBuilder();
+ for(int i=0;i<finalSc.size();i++)
+ {
+ sb.append(sent.get(finalSc.get(i).getSentId()).toString().trim() +".. ");
+ }
+ // Pick sentences
+ return sb.toString();
+ }
+
+ public static void main(String[] args)
+ {
+ try{
+ String posModelFileName = "./resources/en-pos-maxent.bin";
+ String sentFragModel = "resources/en-sent.bin";
+ DefaultDocProcessor dp =new DefaultDocProcessor(sentFragModel);
+ Logger l = Logger.getAnonymousLogger();
+ MetaSummarizer lcs = new MetaSummarizer(posModelFileName);
+ String article = dp.docToString("test/tax.txt");
+ long strt = System.currentTimeMillis();
+ System.out.println(lcs.summarize(article, 50));
+ System.out.println(System.currentTimeMillis() - strt);
+
+ article = dp.docToString("test/houston-rep-nopara.txt");
+ strt = System.currentTimeMillis();
+ System.out.println(lcs.summarize(article, 50));
+ System.out.println(System.currentTimeMillis() - strt);
+
+ article = dp.docToString("gunman.txt");
+ strt = System.currentTimeMillis();
+ System.out.println(lcs.summarize(article, 50));
+ System.out.println(System.currentTimeMillis() - strt);
+
+ article = dp.docToString("satellite.txt");
+ strt = System.currentTimeMillis();
+ System.out.println(lcs.summarize(article, 50));
+ System.out.println(System.currentTimeMillis() - strt);
+ }catch(Exception ex){
+ ex.printStackTrace();
+ }
+ }
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
new file mode 100755
index 0000000..a623698
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.preprocess;
+
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.LineNumberReader;
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+import java.text.BreakIterator;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Locale;
+import java.util.Hashtable;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import opennlp.summarization.Sentence;
+import opennlp.summarization.DocProcessor;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.stemmer.Stemmer;
+
+
+/*
+ * Parse document to sentences..
+ */
+public class DefaultDocProcessor implements DocProcessor
+{
+ SentenceModel sentModel;
+ Stemmer stemmer;
+ StopWords sw;
+ //Sentence fragmentation to use..
+ static int OPEN_NLP = 1;
+ static int SIMPLE = 2;
+ static int SENTENCE_FRAG= OPEN_NLP;
+
+ public DefaultDocProcessor(String fragModelFile){
+ try {
+ InputStream modelIn = new FileInputStream(fragModelFile);
+ sentModel = new SentenceModel(modelIn);
+ }catch(Exception ex){
+ Logger.getAnonymousLogger().info("Error while parsing.. Ignoring the line and marching on.. "+ ex.getMessage());
+ }
+ }
+
+ //Str - Document or para
+ //sentences - List containing returned sentences
+ // iidx - if not null update with the words in the sentence + sent id
+ // processedSent - Sentences after stemming and stopword removal..
+ private void getSentences(String str, List<String> sentences, Hashtable<String, List<Integer>> iidx, List<String> processedSent)
+ {
+ int oldSentEndIdx = 0;
+ int sentEndIdx = 0;
+ Stemmer stemmer = new PorterStemmer();
+ StopWords sw = StopWords.getInstance();
+ BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);
+ BreakIterator wrdItr = BreakIterator.getWordInstance(Locale.US);
+ iterator.setText(str);
+ int start = iterator.first();
+ int sentCnt = 0;
+
+ for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next())
+ {
+ String sentence = str.substring(start,end);//str.substring(oldSentEndIdx, sentEndIdx).trim();
+
+ //Add the sentence as-is; do any processing at the word level..
+ //To lower case and trim all punctuations
+ sentences.add(sentence);
+ wrdItr.setText(sentence);
+ StringBuffer procSent = new StringBuffer();
+ int wrdStrt = 0;
+
+ for(int wrdEnd = wrdItr.next(); wrdEnd != BreakIterator.DONE;
+ wrdStrt = wrdEnd, wrdEnd = wrdItr.next())
+ {
+ String word = sentence.substring(wrdStrt, wrdEnd);//words[i].trim();
+ word.replaceAll("\"|'","");
+
+ //Skip stop words and stem the word..
+ if(sw.isStopWord(word)) continue;
+
+ String stemedWrd = stemmer.stem(word).toString();
+
+ //update iidx by adding the current sentence to the list..
+ if(iidx!=null)
+ {
+ if(stemedWrd.length()>1)
+ {
+ List<Integer> sentList= iidx.get(stemedWrd);
+ if(sentList==null)
+ {
+ sentList = new ArrayList<Integer>();
+ }
+
+ sentList.add(sentCnt);
+ //Save it back
+ iidx.put(stemedWrd, sentList);
+ }
+ }
+ procSent.append(stemedWrd+" ");
+ }
+
+ sentCnt++;
+ if(processedSent!=null )
+ processedSent.add(procSent.toString());
+ }
+ }
+
+
+ public String docToString(String fileName)
+ {
+ LineNumberReader lnr = null;
+ StringBuffer docBuffer = new StringBuffer();
+
+ try {
+ lnr = new LineNumberReader(new FileReader(fileName));
+ String nextLine;
+
+ while ((nextLine = lnr.readLine()) != null) {
+ String trimmedLine = nextLine.trim();
+ if (!trimmedLine.isEmpty() ) {
+ docBuffer.append(trimmedLine.replaceAll("&#?[0-9 a-z A-Z][0-9 a-z A-Z][0-9 a-z A-Z]?;", "")+" ");
+ }
+ }
+ } catch (Exception ex) {
+ Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex);
+ } finally {
+ try {
+ lnr.close();
+ } catch (IOException ex) {
+ Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex);
+ }
+ }
+
+ return docBuffer.toString();
+ }
+
+ //List of sentences form a document
+ public List<Sentence> docToSentList(String fileName)
+ {
+ List<Sentence> sentList = new ArrayList<Sentence>();
+ LineNumberReader lnr = null;
+ StringBuffer docBuffer = new StringBuffer();
+
+ try {
+ lnr = new LineNumberReader(new FileReader(fileName));
+ String nextLine;
+ int paraNo =0;
+ int sentNo = 0;
+ while ((nextLine = lnr.readLine()) != null) {
+ String trimmedLine = nextLine.trim();
+ if (!trimmedLine.isEmpty()) {
+ List<String> sents = new ArrayList<String>();
+ List<String> cleanedSents = new ArrayList<String>();
+ this.getSentences(trimmedLine, sents, null, cleanedSents);
+ int paraPos = 1;
+ for(String sen:sents)
+ {
+ Sentence s = new Sentence();
+ s.setSentId(sentNo++);
+ s.setParagraph(paraNo);
+ s.setStringVal(sen);
+ s.setParaPos(paraPos++);
+ sentList.add(s);
+ }
+ paraNo++;
+ }
+ }
+
+ String doc = docBuffer.toString();
+ } catch (Exception ex) {
+ Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex);
+ ex.printStackTrace();
+ } finally {
+ try {
+ lnr.close();
+ } catch (IOException ex) {
+ Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex);
+ }
+ }
+
+ return sentList;
+ }
+
+
+ public List<Sentence> getSentencesFromStr(String text) {
+ List<Sentence> ret = new ArrayList<Sentence>();
+
+ List<String> sentStrs = new ArrayList<String>();
+ List<String> cleanedSents = new ArrayList<String>();
+
+ //Custom/simple method if specified or open nlp model was not found..
+ if(sentModel==null || SENTENCE_FRAG==SIMPLE)
+ getSentences(text, sentStrs, null, cleanedSents);
+ else{
+ SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
+ String[] sentences = sentenceDetector.sentDetect(text);
+ for(String sentence : sentences)
+ {
+ Logger.getLogger("DocProcessor").info(sentence);
+ sentStrs.add(sentence);
+ }
+ }
+ int sentNo = 0;
+
+ for(String sen:sentStrs)
+ {
+ Sentence s = new Sentence();
+ s.setSentId(sentNo);
+ s.setParagraph(1);
+ s.setStringVal(sen);
+ s.setParaPos(sentNo);
+ ret.add(s);
+ sentNo++;
+ }
+ return ret;
+ }
+
+
+ public String[] getWords(String sent)
+ {
+ return sent.split(" ");
+ }
+
+ @Override
+ public Stemmer getStemmer() {
+ // TODO Auto-generated method stub
+ return stemmer;
+ }
+
+}
\ No newline at end of file
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java b/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java
new file mode 100755
index 0000000..e733703
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.preprocess;
+
+import java.util.Hashtable;
+import java.io.File;
+import java.io.FileReader;
+import java.io.LineNumberReader;
+
+import com.sun.istack.internal.logging.Logger;
+
+/*
+ * Class to load inverse document frequency for words. Resources like google n-gram can be used to populate this.
+ *
+ */
+public class IDFWordWeight implements WordWeight
+{
+ Hashtable<String, Double> idf;
+ private static IDFWordWeight instance;
+
+ public IDFWordWeight(String fileName)
+ {
+ idf = new Hashtable<String,Double>();
+ load(fileName);
+ }
+
+ public static IDFWordWeight getInstance(String fileName)
+ {
+ if(instance==null)
+ instance = new IDFWordWeight(fileName);
+ return instance;
+ }
+
+ public double getWordWeight(String s)
+ {
+ if(idf==null) return 1d;
+
+ Double d = idf.get(s);
+ if(d == null)
+ {
+ return 1;
+ }
+ return d.doubleValue();
+ }
+
+ /*
+ * Loads the IDF for words from given file. The file is required to have a simple format -
+ * word, IDF.
+ */
+ public void load(String fileName)
+ {
+ try{
+ LineNumberReader lnr = new LineNumberReader(new FileReader(fileName));
+ String nextLine;
+
+ while ((nextLine = lnr.readLine()) != null)
+ {
+ String trimmedLine = nextLine.trim();
+ if (!trimmedLine.isEmpty())
+ {
+ String[] tokens = trimmedLine.split(",");
+ String word = tokens[0]; double idfVal = Double.parseDouble(tokens[1]);
+ idf.put(word, idfVal);
+ }
+ }
+ }catch(Exception ex){
+ Logger.getLogger(opennlp.summarization.preprocess.IDFWordWeight.class).warning("Could not load the file with IDF");
+ }
+ }
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java b/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java
new file mode 100755
index 0000000..5aa90ae
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java
@@ -0,0 +1,391 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.preprocess;
+
+/*
+
+ Porter stemmer in Java. The original paper is in
+
+ Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+ no. 3, pp 130-137,
+
+ See also http://www.tartarus.org/~martin/PorterStemmer
+
+ History:
+
+ Release 1
+
+ Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
+ The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
+ is then out outside the bounds of b.
+
+ Release 2
+
+ Similarly,
+
+ Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
+ 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
+ b[j] is then outside the bounds of b.
+
+ Release 3
+
+ Considerably revised 4/9/00 in the light of many helpful suggestions
+ from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
+
+ Release 4
+
+*/
+
+import java.io.*;
+
+import opennlp.tools.stemmer.Stemmer;
+
+/**
+ * Stemmer, implementing the Porter Stemming Algorithm
+ *
+ * The Stemmer class transforms a word into its root form. The input
+ * word can be provided a character at time (by calling add()), or at once
+ * by calling one of the various stem(something) methods.
+ */
+
+public class PorterStemmer implements Stemmer
+{ private char[] b;
+ private int i, /* offset into b */
+ i_end, /* offset to end of stemmed word */
+ j, k;
+ private static final int INC = 50;
+ /* unit of size whereby b is increased */
+ public PorterStemmer()
+ { b = new char[INC];
+ i = 0;
+ i_end = 0;
+ }
+
+ /**
+ * Add a character to the word being stemmed. When you are finished
+ * adding characters, you can call stem(void) to stem the word.
+ */
+
+ public void add(char ch)
+ { if (i == b.length)
+ { char[] new_b = new char[i+INC];
+ for (int c = 0; c < i; c++) new_b[c] = b[c];
+ b = new_b;
+ }
+ b[i++] = ch;
+ }
+
+
+ /** Adds wLen characters to the word being stemmed contained in a portion
+ * of a char[] array. This is like repeated calls of add(char ch), but
+ * faster.
+ */
+
+ public void add(char[] w, int wLen)
+ { if (i+wLen >= b.length)
+ { char[] new_b = new char[i+wLen+INC];
+ for (int c = 0; c < i; c++) new_b[c] = b[c];
+ b = new_b;
+ }
+ for (int c = 0; c < wLen; c++) b[i++] = w[c];
+ }
+
+ /**
+ * After a word has been stemmed, it can be retrieved by toString(),
+ * or a reference to the internal buffer can be retrieved by getResultBuffer
+ * and getResultLength (which is generally more efficient.)
+ */
+ public String toString() { return new String(b,0,i_end); }
+
+ /**
+ * Returns the length of the word resulting from the stemming process.
+ */
+ public int getResultLength() { return i_end; }
+
+ /**
+ * Returns a reference to a character buffer containing the results of
+ * the stemming process. You also need to consult getResultLength()
+ * to determine the length of the result.
+ */
+ public char[] getResultBuffer() { return b; }
+
+ /* cons(i) is true <=> b[i] is a consonant. */
+
+ private final boolean cons(int i)
+ { switch (b[i])
+ { case 'a': case 'e': case 'i': case 'o': case 'u': return false;
+ case 'y': return (i==0) ? true : !cons(i-1);
+ default: return true;
+ }
+ }
+
+ /* m() measures the number of consonant sequences between 0 and j. if c is
+ a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
+ presence,
+
+ <c><v> gives 0
+ <c>vc<v> gives 1
+ <c>vcvc<v> gives 2
+ <c>vcvcvc<v> gives 3
+ ....
+ */
+
+ private final int m()
+ { int n = 0;
+ int i = 0;
+ while(true)
+ { if (i > j) return n;
+ if (! cons(i)) break; i++;
+ }
+ i++;
+ while(true)
+ { while(true)
+ { if (i > j) return n;
+ if (cons(i)) break;
+ i++;
+ }
+ i++;
+ n++;
+ while(true)
+ { if (i > j) return n;
+ if (! cons(i)) break;
+ i++;
+ }
+ i++;
+ }
+ }
+
+ /* vowelinstem() is true <=> 0,...j contains a vowel */
+
+ private final boolean vowelinstem()
+ { int i; for (i = 0; i <= j; i++) if (! cons(i)) return true;
+ return false;
+ }
+
+ /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
+
+ private final boolean doublec(int j)
+ { if (j < 1) return false;
+ if (b[j] != b[j-1]) return false;
+ return cons(j);
+ }
+
+ /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
+ and also if the second c is not w,x or y. this is used when trying to
+ restore an e at the end of a short word. e.g.
+
+ cav(e), lov(e), hop(e), crim(e), but
+ snow, box, tray.
+
+ */
+
+ private final boolean cvc(int i)
+ { if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false;
+ { int ch = b[i];
+ if (ch == 'w' || ch == 'x' || ch == 'y') return false;
+ }
+ return true;
+ }
+
+ private final boolean ends(String s)
+ { int l = s.length();
+ int o = k-l+1;
+ if (o < 0) return false;
+ for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false;
+ j = k-l;
+ return true;
+ }
+
+ /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
+ k. */
+
+ private final void setto(String s)
+ { int l = s.length();
+ int o = j+1;
+ for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);
+ k = j+l;
+ }
+
+ /* r(s) is used further down. */
+
+ private final void r(String s) { if (m() > 0) setto(s); }
+
/* step1() gets rid of plurals and -ed or -ing. e.g.

      caresses  ->  caress
      ponies    ->  poni
      ties      ->  ti
      caress    ->  caress
      cats      ->  cat

      feed      ->  feed
      agreed    ->  agree
      disabled  ->  disable

      matting   ->  mat
      mating    ->  mate
      meeting   ->  meet
      milling   ->  mill
      messing   ->  mess

      meetings  ->  meet
*/
private final void step1()
{  // Plural endings: "sses" -> "ss", "ies" -> "i"; otherwise drop a lone "s".
   if (b[k] == 's')
   {  if (ends("sses")) k -= 2; else
      if (ends("ies")) setto("i"); else
      if (b[k-1] != 's') k--;
   }
   // "eed" only loses its final "e" when the stem measure m() > 0 ("agreed" -> "agree").
   if (ends("eed")) { if (m() > 0) k--; } else
   if ((ends("ed") || ends("ing")) && vowelinstem())
   {  k = j;  // strip the suffix, then repair the stem ending:
      if (ends("at")) setto("ate"); else
      if (ends("bl")) setto("ble"); else
      if (ends("iz")) setto("ize"); else
      if (doublec(k))
      {  // Undouble a final consonant ("matting" -> "mat") unless it is l, s or z.
         k--;
         {  int ch = b[k];
            if (ch == 'l' || ch == 's' || ch == 'z') k++;
         }
      }
      // Restore a final "e" on short consonant-vowel-consonant stems ("hop" -> "hope").
      else if (m() == 1 && cvc(k)) setto("e");
   }
}
+
/* step2() turns terminal y to i when there is another vowel in the stem,
   so that words like "happy"/"happiness" stem alike. */
private final void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; }
+
/* step3() maps double suffices to single ones. so -ization ( = -ize plus
   -ation) maps to -ize etc. note that the string before the suffix must give
   m() > 0 (enforced inside r()). The switch on b[k-1] (the penultimate
   character) merely narrows the candidate suffixes before the ends() tests. */
private final void step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1])
{
   case 'a': if (ends("ational")) { r("ate"); break; }
             if (ends("tional")) { r("tion"); break; }
             break;
   case 'c': if (ends("enci")) { r("ence"); break; }
             if (ends("anci")) { r("ance"); break; }
             break;
   case 'e': if (ends("izer")) { r("ize"); break; }
             break;
   case 'l': if (ends("bli")) { r("ble"); break; }
             if (ends("alli")) { r("al"); break; }
             if (ends("entli")) { r("ent"); break; }
             if (ends("eli")) { r("e"); break; }
             if (ends("ousli")) { r("ous"); break; }
             break;
   case 'o': if (ends("ization")) { r("ize"); break; }
             if (ends("ation")) { r("ate"); break; }
             if (ends("ator")) { r("ate"); break; }
             break;
   case 's': if (ends("alism")) { r("al"); break; }
             if (ends("iveness")) { r("ive"); break; }
             if (ends("fulness")) { r("ful"); break; }
             if (ends("ousness")) { r("ous"); break; }
             break;
   case 't': if (ends("aliti")) { r("al"); break; }
             if (ends("iviti")) { r("ive"); break; }
             if (ends("biliti")) { r("ble"); break; }
             break;
   case 'g': if (ends("logi")) { r("log"); break; }
} }
+
/* step4() deals with -ic-, -full, -ness etc. similar strategy to step3.
   (Unlike step3()/step5() no k == 0 guard is needed: the switch reads b[k],
   not b[k-1].) */
private final void step4() { switch (b[k])
{
   case 'e': if (ends("icate")) { r("ic"); break; }
             if (ends("ative")) { r(""); break; }
             if (ends("alize")) { r("al"); break; }
             break;
   case 'i': if (ends("iciti")) { r("ic"); break; }
             break;
   case 'l': if (ends("ical")) { r("ic"); break; }
             if (ends("ful")) { r(""); break; }
             break;
   case 's': if (ends("ness")) { r(""); break; }
             break;
} }
+
/* step5() takes off -ant, -ence etc., in context <c>vcvc<v>.
   Reading guide: within each case, "break" means a candidate suffix matched
   (it is stripped below when the measure m() > 1), while "return" means no
   suffix applies and the word is left unchanged. */
private final void step5()
{  if (k == 0) return; /* for Bug 1 */ switch (b[k-1])
   {  case 'a': if (ends("al")) break; return;
      case 'c': if (ends("ance")) break;
                if (ends("ence")) break; return;
      case 'e': if (ends("er")) break; return;
      case 'i': if (ends("ic")) break; return;
      case 'l': if (ends("able")) break;
                if (ends("ible")) break; return;
      case 'n': if (ends("ant")) break;
                if (ends("ement")) break;
                if (ends("ment")) break;
                /* element etc. not stripped before the m */
                if (ends("ent")) break; return;
      // "ion" is only stripped after 's' or 't' (e.g. -sion/-tion).
      case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
                /* j >= 0 fixes Bug 2 */
                if (ends("ou")) break; return;
                /* takes care of -ous */
      case 's': if (ends("ism")) break; return;
      case 't': if (ends("ate")) break;
                if (ends("iti")) break; return;
      case 'u': if (ends("ous")) break; return;
      case 'v': if (ends("ive")) break; return;
      case 'z': if (ends("ize")) break; return;
      default: return;
   }
   if (m() > 1) k = j;  // strip the matched suffix
}
+
/* step6() removes a final -e if m() > 1, and reduces a final doubled -l
   to a single -l when m() > 1. */
private final void step6()
{  j = k;
   if (b[k] == 'e')
   {  int a = m();
      // Drop the "e" when the measure is > 1, or equals 1 and the preceding
      // stem does not end in a short consonant-vowel-consonant sequence.
      if (a > 1 || a == 1 && !cvc(k-1)) k--;
   }
   if (b[k] == 'l' && doublec(k) && m() > 1) k--;
}
+
/** Stem the word placed into the Stemmer buffer through calls to add().
 * The result can be retrieved with getResultLength()/getResultBuffer()
 * or toString(). (NOTE(review): the original comment claimed a boolean
 * "word changed" return value, but this method is void.)
 */
public void stem()
{  // k indexes the last input character; words of length <= 2 are left unstemmed.
   k = i - 1;
   if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }
   // i_end marks the end of the stemmed result (presumably read by
   // getResultLength() - not visible here); i is reset for the next word.
   i_end = k+1; i = 0;
}
+
+ public CharSequence stem(CharSequence word)
+ {
+ b = new char[word.length()];
+ char[] arr = word.toString().toCharArray();
+ for(k=0;k<arr.length;k++) this.add(arr[k]);
+ stem();
+ return this.toString();
+ }
+}
+
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java b/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java
new file mode 100755
index 0000000..f1c4be4
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.preprocess;
+
+import java.util.Hashtable;
+
/**
 * A small English stop-word list (plus the digits 0-9). Words on this list
 * carry little topical content and are skipped by the similarity and ranking
 * computations.
 *
 * <p>Fixes over the original version: three entries carried stray whitespace
 * ("ours ", " ourselves", "yourselves ") and therefore never matched; the
 * Hashtable-as-set is replaced by a plain Set; getInstance() is synchronized
 * so lazy initialization is thread-safe.
 *
 * @author rtww
 */
public class StopWords {
  // Membership in this set marks a word as a stop word (matching is case-sensitive).
  private final Set<String> words;

  private static StopWords instance;

  public StopWords()
  {
    words = new HashSet<String>();
    Collections.addAll(words,
        "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
        "a", "about", "above", "after", "again", "against", "all", "am",
        "an", "and", "any", "are", "aren't", "as", "at",
        "be", "because", "been", "before", "being", "below", "between",
        "both", "but", "by",
        "can't", "cannot", "could", "couldn't",
        "did", "didn't", "do", "does", "doesn't", "doing", "don't",
        "down", "during",
        "each", "few", "for", "from", "further",
        "had", "hadn't", "has", "hasn't", "have", "haven't", "having",
        "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers",
        "herself", "him", "himself", "his", "how", "how's",
        "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is",
        "isn't", "it", "it's", "its", "itself",
        "let's",
        "me", "more", "most", "mustn't", "my", "myself",
        "no", "nor", "not",
        "of", "off", "on", "once", "only", "or", "other", "ought",
        "our", "ours", "ourselves", "out", "over", "own",
        "same", "shan't", "she", "she'd", "she'll", "she's", "should",
        "shouldn't", "so", "some", "say", "said", "such",
        "than", "that", "that's", "the", "their", "theirs", "them",
        "themselves", "then", "there", "there's", "these", "they",
        "they'd", "they'll", "they're", "they've", "this", "those",
        "through", "to", "too",
        "under", "until", "up",
        "very",
        "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were",
        "weren't", "what", "what's", "when", "when's", "where", "where's",
        "which", "while", "who", "who's", "whom", "why", "why's", "with",
        "won't", "would", "wouldn't",
        "you", "you'd", "you'll", "you're", "you've", "your", "yours",
        "yourself", "yourselves");
  }

  /**
   * Returns true when s is on the stop-word list. As in the original
   * implementation, any single-character token is also treated as a stop word.
   */
  public boolean isStopWord(String s)
  {
    return s.length() == 1 || words.contains(s);
  }

  /** Returns the lazily created shared instance (thread-safe). */
  public static synchronized StopWords getInstance()
  {
    if (instance == null)
      instance = new StopWords();
    return instance;
  }
}
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java b/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java
new file mode 100755
index 0000000..f360036
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.preprocess;
+
/**
 * Strategy interface for assigning a numeric weight to a word (e.g. an IDF
 * value); higher weights mark more informative words.
 */
public interface WordWeight
{
  /** Returns the weight of the given word. */
  public double getWordWeight(String s);
}
diff --git a/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
new file mode 100755
index 0000000..57dcf25
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.textrank;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.Hashtable;
+import java.util.List;
+
+import opennlp.summarization.*;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+import opennlp.summarization.preprocess.IDFWordWeight;
+import opennlp.summarization.preprocess.PorterStemmer;
+import opennlp.summarization.preprocess.StopWords;
+import opennlp.summarization.preprocess.WordWeight;
+
+/*
+ * Implements the TextRank algorithm by Mihalcea et al.
+ * This basically applies the page rank algorithm to a graph where each sentence is a node and a connection between sentences
+ * indicates that a word is shared between them. It returns a ranking of sentences where highest rank means most important etc.
+ * Currently only stemming is done to the words - a more sophisticated way might use a resource like Wordnet to match synonyms etc.
+ */
+public class TextRank {
+ private StopWords sw;
+ private String article;
+ private Hashtable<Integer, List<Integer>> links;
+ private List<String> sentences = new ArrayList<String>();
+ private List<String> processedSent = new ArrayList<String>();
+ private WordWeight wordWt;
+ private int NO_OF_IT = 100;
+ private double maxErr = 0.1;
+ private DocProcessor docProc;
+
+ private double title_wt = 0;
+ private Hashtable<Integer, String[]> wordsInSent;
+
+ // DAMPING FACTOR..
+ private static double df = 0.15;
+ private boolean HIGHER_TITLE_WEIGHT = true;
+ private static double TITLE_WRD_WT = 2d;
+ private String resources = "./resources";
+
+ public TextRank(DocProcessor dp) {
+ sw = new StopWords();
+ setLinks(new Hashtable<Integer, List<Integer>>());
+ processedSent = new ArrayList<String>();
+ docProc = dp;
+ wordWt = IDFWordWeight.getInstance(resources + "/idf.csv");
+ }
+
+ public TextRank(StopWords sw, WordWeight wordWts) {
+ this.sw = sw;
+ this.wordWt = wordWts;
+ }
+
+ // Returns similarity of two sentences. Wrd wts contains tf-idf of the
+ // words..
+ public double getWeightedSimilarity(String sent1, String sent2,
+ Hashtable<String, Double> wrdWts) {
+ String[] words1 = sent1.split(" ");
+ String[] words2 = sent2.split(" ");
+ double wordsInCommon = 0;
+ Hashtable<String, Boolean> dups = new Hashtable<String, Boolean>();
+ for (int i = 0; i < words1.length; i++) {
+ String currWrd1 = words1[i].trim();
+ // skip over duplicate words of sentence
+ if (dups.get(currWrd1) == null) {
+ dups.put(currWrd1, true);
+ for (int j = 0; j < words2.length; j++) {
+ if (!sw.isStopWord(currWrd1) && !currWrd1.isEmpty()
+ && words1[i].equals(words2[j])) {
+ Double wt;
+
+ wt = wrdWts.get(currWrd1);
+ if (wt != null)
+ wordsInCommon += wt.doubleValue();
+ else
+ wordsInCommon++;
+ }
+ }
+ }
+ }
+ return ((double) ((wordsInCommon)))
+ / (words1.length + words2.length);
+ }
+
+ // Gets the current score from the list of scores passed ...
+ public double getScoreFrom(List<Score> scores, int id) {
+ for (Score s : scores) {
+ if (s.getSentId() == id)
+ return s.getScore();
+ }
+ return 1;
+ }
+
+ // This method runs the page rank algorithm for the sentences.
+ // TR(Vi) = (1-d) + d * sigma over neighbors Vj( wij/sigma over k neighbor
+ // of j(wjk) * PR(Vj) )
+ public List<Score> getTextRankScore(List<Score> rawScores,
+ List<String> sentences, Hashtable<String, Double> wrdWts) {
+ List<Score> currWtScores = new ArrayList<Score>();
+ // Start with equal weights for all sentences
+ for (int i = 0; i < rawScores.size(); i++) {
+ Score ns = new Score();
+ ns.setSentId(rawScores.get(i).getSentId());
+ ns.setScore((1 - title_wt) / (rawScores.size()));// this.getSimilarity();
+ currWtScores.add(ns);
+ }
+ // currWtScores.get(0).score = this.title_wt;
+
+ // Page rank..
+ for (int i = 0; i < NO_OF_IT; i++) {
+ double totErr = 0;
+ List<Score> newWtScores = new ArrayList<Score>();
+
+ // Update the scores for the current iteration..
+ for (Score rs : rawScores) {
+ int sentId = rs.getSentId();
+ Score ns = new Score();
+ ns.setSentId(sentId);
+
+ List<Integer> neighbors = getLinks().get(sentId);
+ double sum = 0;
+ if (neighbors != null) {
+ for (Integer j : neighbors) {
+ // sum += getCurrentScore(rawScores,
+ // sentId)/(getCurrentScore(rawScores, neigh)) *
+ // getCurrentScore(currWtScores, neigh);
+ double wij = this.getWeightedSimilarity(sentences
+ .get(sentId), sentences.get(j), wrdWts);
+ double sigmawjk = getScoreFrom(rawScores, j);
+ double txtRnkj = getScoreFrom(currWtScores, j);
+ sum += wij / sigmawjk * txtRnkj;
+ }
+ }
+ ns.setScore((1d - df) + sum * df);// * rs.score
+ totErr += ns.getScore() - getScoreFrom(rawScores, sentId);
+ newWtScores.add(ns);
+ }
+ currWtScores = newWtScores;
+ if (i > 2 && totErr / rawScores.size() < maxErr)
+ break;
+ }
+
+ for (int i = 0; i < currWtScores.size(); i++) {
+ Score s = currWtScores.get(i);
+ s.setScore(s.getScore() * getScoreFrom(rawScores, s.getSentId()));
+ }
+ return currWtScores;
+ }
+
+ // Raw score is sigma wtsimilarity of neighbors..
+ // Used in the denominator of the Text rank formula..
+ public List<Score> getNeighborsSigmaWtSim(List<String> sentences,
+ Hashtable<String, List<Integer>> iidx, Hashtable<String, Double> wts) {
+ List<Score> allScores = new ArrayList<Score>();
+
+ for (int i = 0; i < sentences.size(); i++) {
+ String nextSent = sentences.get(i);
+ String[] words = nextSent.split(" ");
+ List<Integer> processed = new ArrayList<Integer>();
+ Score s = new Score();
+ s.setSentId(i);
+
+ for (int j = 0; j < words.length; j++) {
+ String currWrd = docProc.getStemmer().stem(words[j]).toString();//stemmer.toString();
+
+ List<Integer> otherSents = iidx.get(currWrd);
+ if (otherSents == null)
+ continue;
+
+ for (int k = 0; k < otherSents.size(); k++) {
+ int idx = otherSents.get(k);
+
+ if (idx != i && !processed.contains(idx)) {
+ double currS = getWeightedSimilarity(sentences.get(i),
+ sentences.get(idx), wts);
+ s.setScore(s.getScore() + currS);
+
+ if (currS > 0) {
+ addLink(i, idx);
+ }
+ processed.add(idx);
+ }
+ }
+ }
+ allScores.add(s);
+ }
+ return allScores;
+ }
+
+ public List<Score> getWeightedScores(List<Score> rawScores,
+ List<String> sentences, Hashtable<String, Double> wordWts) {
+ List<Score> weightedScores = this.getTextRankScore(rawScores,
+ sentences, wordWts);
+ Collections.sort(weightedScores);
+ return weightedScores;
+ }
+
+ private Hashtable<String, Double> toWordWtHashtable(WordWeight wwt,
+ Hashtable<String, List<Integer>> iidx) {
+ Hashtable<String, Double> wrdWt = new Hashtable<String, Double>();
+ Enumeration<String> keys = iidx.keys();
+ while (keys.hasMoreElements()) {
+ String key = keys.nextElement();
+ wrdWt.put(key, wwt.getWordWeight(key));
+ }
+ return wrdWt;
+ }
+
+ public List<Score> getRankedSentences(String doc, List<String> sentences,
+ Hashtable<String, List<Integer>> iidx, List<String> processedSent) {
+ this.sentences = sentences;
+ this.processedSent = processedSent;
+
+ List<Integer> chosenOnes = new ArrayList<Integer>();
+
+ Hashtable<String, Double> wrdWts = toWordWtHashtable(this.wordWt, iidx);// new
+ // Hashtable<String,
+ // Double>();
+
+ if (HIGHER_TITLE_WEIGHT && getSentences().size()>0) {
+ String sent = getSentences().get(0);
+ String[] wrds = sent.split(" ");
+ for (String wrd : wrds)
+ wrdWts.put(wrd, new Double(TITLE_WRD_WT));
+ }
+
+ List<Score> rawScores = getNeighborsSigmaWtSim(getSentences(), iidx,
+ wrdWts);
+ List<Score> finalScores = getWeightedScores(rawScores, getSentences(),
+ wrdWts);
+
+ Score bestScr = null;
+ int next = 0;
+
+ return finalScores;
+ }
+
+ // Set a link between two sentences..
+ private void addLink(int i, int idx) {
+ List<Integer> endNodes = getLinks().get(i);
+ if (endNodes == null)
+ endNodes = new ArrayList<Integer>();
+ endNodes.add(idx);
+ getLinks().put(i, endNodes);
+ }
+
+ public void setSentences(List<String> sentences) {
+ this.sentences = sentences;
+ }
+
+ public List<String> getSentences() {
+ return sentences;
+ }
+
+ public void setArticle(String article) {
+ this.article = article;
+ }
+
+ public String getArticle() {
+ return article;
+ }
+
+ private void setLinks(Hashtable<Integer, List<Integer>> links) {
+ this.links = links;
+ }
+
+ public Hashtable<Integer, List<Integer>> getLinks() {
+ return links;
+ }
+}
+
+/*
+ * public double getScore(String sent1, String sent2, boolean toPrint) {
+ * String[] words1 = sent1.split(" "); String[] words2 = sent2.split(" ");
+ * double wordsInCommon = 0; for(int i=0;i< words1.length;i++) { for(int
+ * j=0;j<words2.length;j++) { if(!sw.isStopWord(words1[i]) &&
+ * !words1[i].trim().isEmpty() && words1[i].equals(words2[j])) { wordsInCommon+=
+ * wordWt.getWordWeight(words1[i]); } } } return ((double)wordsInCommon) /
+ * (Math.log(1+words1.length) + Math.log(1+words2.length)); }
+ */
\ No newline at end of file
diff --git a/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java b/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java
new file mode 100755
index 0000000..e60c574
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.textrank;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.io.PrintWriter;
+import java.util.*;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import opennlp.summarization.*;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+import opennlp.summarization.preprocess.IDFWordWeight;
+import opennlp.summarization.preprocess.WordWeight;
+
+/*
+ * A wrapper around the text rank algorithm. This class
+ * a) Sets up the data for the TextRank class
+ * b) Takes the ranked sentences and does some basic rearranging (e.g. ordering) to provide a more reasonable summary.
+ */
+public class TextRankSummarizer implements Summarizer
+{
+ //An optional file to store idf of words. If idf is not available it uses a default equal weight for all words.
+ private String idfFile = "resources/idf.csv";
+ public TextRankSummarizer() throws Exception
+ {
+ }
+
+ /*Sets up data and calls the TextRank algorithm..*/
+ public List<Score> rankSentences(String doc, List<Sentence> sentences,
+ DocProcessor dp, int maxWords )
+ {
+ try {
+ //Rank sentences
+ TextRank summ = new TextRank(dp);
+ List<String> sentenceStrL = new ArrayList<String>();
+ List<String> processedSent = new ArrayList<String>();
+ Hashtable<String, List<Integer>> iidx = new Hashtable<String, List<Integer>>();
+ // dp.getSentences(sentences, sentenceStrL, iidx, processedSent);
+
+ for(Sentence s : sentences){
+ sentenceStrL.add(s.getStringVal());
+ String stemmedSent = s.stem();
+ processedSent.add(stemmedSent);
+
+ String[] wrds = stemmedSent.split(" ");
+ for(String w: wrds)
+ {
+ if(iidx.get(w)!=null)
+ iidx.get(w).add(s.getSentId());
+ else{
+ List<Integer> l = new ArrayList<Integer>();
+ l.add(s.getSentId());
+ iidx.put(w, l);
+ }
+ }
+ }
+
+ WordWeight wordWt = new IDFWordWeight(idfFile);////new
+
+ List<Score> finalScores = summ.getRankedSentences(doc, sentenceStrL, iidx, processedSent);
+ List<String> sentenceStrList = summ.getSentences();
+
+ // SentenceClusterer clust = new SentenceClusterer();
+ // clust.runClusterer(doc, summ.processedSent);
+
+ Hashtable<Integer,List<Integer>> links= summ.getLinks();
+
+ for(int i=0;i<sentences.size();i++)
+ {
+ Sentence st = sentences.get(i);
+
+ //Add links..
+ List<Integer> currLnks = links.get(i);
+ if(currLnks==null) continue;
+ for(int j=0;j<currLnks.size();j++)
+ {
+ if(j<i) st.addLink(sentences.get(j));
+ }
+ }
+
+ for(int i=0;i<finalScores.size();i++)
+ {
+ Score s = finalScores.get(i);
+ Sentence st = sentences.get(s.getSentId());
+ st.setPageRankScore(s);
+ }
+
+ List<Score> reRank = finalScores;//reRank(sentences, finalScores, iidx, wordWt, maxWords);
+
+ return reRank;
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ return null;
+ }
+
+ //Returns the summary as a string.
+ @Override
+ public String summarize(String article, DocProcessor dp, int maxWords) {
+ List<Sentence> sentences = dp.getSentencesFromStr(article);
+ List<Score> scores = this.rankSentences(article, sentences, dp, maxWords);
+ return scores2String(sentences, scores, maxWords);
+ }
+
+ /* Use the page rank scores to determine the summary.*/
+ public String scores2String(List<Sentence> sentences, List<Score> scores, int maxWords)
+ {
+ StringBuffer b = new StringBuffer();
+ // for(int i=0;i< min(maxWords, scores.size()-1);i++)
+ int i=0;
+ while(b.length()< maxWords && i< scores.size())
+ {
+ String sent = sentences.get(scores.get(i).getSentId()).getStringVal();
+ b.append(sent + scores.get(i));
+ i++;
+ }
+ return b.toString();
+ }
+
+}
diff --git a/summarizer/src/test/java/unittests/DocProcessorTest.java b/summarizer/src/test/java/unittests/DocProcessorTest.java
new file mode 100644
index 0000000..2b8f723
--- /dev/null
+++ b/summarizer/src/test/java/unittests/DocProcessorTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+
+package unittests;
+
+import static org.junit.Assert.*;
+
+import org.junit.Assert.*;
+
+import java.io.UnsupportedEncodingException;
+import java.util.List;
+
+import opennlp.summarization.Sentence;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class DocProcessorTest {
+
+ @BeforeClass
+ public static void setUpBeforeClass() throws Exception {
+ }
+
+ @Test
+ public void testGetSentencesFromStr() {
+ String sentFragModel = "resources/en-sent.bin";
+ DefaultDocProcessor dp =new DefaultDocProcessor(sentFragModel);
+ String sent="This is a sentence, with some punctuations; to test if the sentence breaker can handle it! Is every thing working OK ? Yes.";
+ List<Sentence> doc = dp.getSentencesFromStr(sent);//dp.docToString(fileName);//
+ assertEquals(doc.size(),3);
+ }
+
+}
diff --git a/summarizer/src/test/java/unittests/LexChainTest.java b/summarizer/src/test/java/unittests/LexChainTest.java
new file mode 100644
index 0000000..60da959
--- /dev/null
+++ b/summarizer/src/test/java/unittests/LexChainTest.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package unittests;
+
+import static org.junit.Assert.*;
+import opennlp.summarization.Sentence;
+import opennlp.summarization.lexicalchaining.LexicalChainingSummarizer;
+import opennlp.summarization.lexicalchaining.LexicalChain;
+import opennlp.summarization.lexicalchaining.*;
+import opennlp.summarization.lexicalchaining.Word;
+import opennlp.summarization.lexicalchaining.WordRelation;
+import opennlp.summarization.lexicalchaining.WordRelationshipDetermination;
+import opennlp.summarization.lexicalchaining.WordnetWord;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import edu.mit.jwi.item.IIndexWord;
+import edu.mit.jwi.item.POS;
+
+import java.util.Collections;
+import java.util.Hashtable;
+import java.util.List;
+
public class LexChainTest {

  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
  }

  // NOTE(review): this "test" makes no assertions - it only prints the lexical
  // chains and the elapsed time - and it reads a hard-coded absolute path that
  // exists only on the original author's machine, so it cannot pass elsewhere.
  // Any exception is swallowed by the catch block below, so it never fails either.
  @Test
  public void testBuildLexicalChains() {
    try {
      /*
      String article = "US President Barack Obama has welcomed an agreement between the US and Russia under which Syria's chemical weapons must be destroyed or removed by mid-2014 as an \"important step\"."
          + "But a White House statement cautioned that the US expected Syria to live up to its public commitments. "
          + "The US-Russian framework document stipulates that Syria must provide details of its stockpile within a week. "
          + "If Syria fails to comply, the deal could be enforced by a UN resolution. "
          + " China, France, the UK, the UN and Nato have all expressed satisfaction at the agreement. "
          + " In Beijing, Foreign Minister Wang Yi said on Sunday that China welcomes the general agreement between the US and Russia.";
      */
      String sentFragModel = "resources/en-sent.bin";
      DefaultDocProcessor dp = new DefaultDocProcessor(sentFragModel);
      // NOTE(review): hard-coded absolute path - replace with a test resource.
      String article = dp.docToString("/Users/ram/dev/summarizer/test/forram/technology/output/summary/9.txt");
      LexicalChainingSummarizer lcs;
      lcs = new LexicalChainingSummarizer(dp, "resources/en-pos-maxent.bin");

      long strt = System.currentTimeMillis();

      List<Sentence> sent = dp.getSentencesFromStr(article);
      List<LexicalChain> vh = lcs.buildLexicalChains(article, sent);
      Collections.sort(vh);

      List<Sentence> s = dp.getSentencesFromStr(article);
      // Tracks lexicons already printed so each chain head is reported once.
      Hashtable<String, Boolean> comp = new Hashtable<String, Boolean>();
      System.out.println(vh.size());
      POSTagger t = new OpenNLPPOSTagger(dp, "resources/en-pos-maxent.bin");
      System.out.println(t.getTaggedString(article));
      // Print the top (up to) 50 chains, strongest last in the sorted list first.
      for (int i = vh.size() - 1; i >= Math.max(vh.size() - 50, 0); i--)
      {
        LexicalChain lc = vh.get(i);

        if (!(comp.containsKey(lc.getWord().get(0).getLexicon())))
        {
          comp.put(lc.getWord().get(0).getLexicon(), new Boolean(true));
          for (int j = 0; j < lc.getWord().size(); j++)
            System.out.print(lc.getWord().get(j) + "-- ");
          System.out.println(lc.score());
          for (Sentence sid : lc.getSentences())
          {
            //if(sid>=0 && sid<s.size())
            System.out.println(sid);
          }
        }
        System.out.println("--------");
      }
      System.out.println((System.currentTimeMillis() - strt) / 1000);
    } catch (Exception e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }

  }

  // Checks that WordNet-based relation detection links "tune" and "vocal" to a
  // chain seeded with the first sense of "music" at medium strength.
  // NOTE(review): exceptions are swallowed here too, so a WordNet setup failure
  // silently passes this test.
  @Test
  public void testGetRelation() {
    try {

      WordRelationshipDetermination lcs = new WordRelationshipDetermination();
      LexicalChain l = new LexicalChain();
      List<Word> words = lcs.getWordSenses("music");

      l.addWord(words.get(0));
//      int rel = lcs.getRelation(l, "nation");
      WordRelation rel2 = lcs.getRelation(l, "tune", true);
      WordRelation rel3 = lcs.getRelation(l, "vocal", true);
      System.out.println(rel2.relation);
      System.out.println(rel3.relation);
      // assertEquals(rel, LexicalChainingSummarizer.STRONG_RELATION);
      assertEquals( WordRelation.MED_RELATION, rel2.relation);
      assertEquals( WordRelation.MED_RELATION, rel3.relation);

    } catch (Exception e) {
      e.printStackTrace();
    }
  }

}
diff --git a/summarizer/src/test/java/unittests/LexChainingKeywordExtractorTest.java b/summarizer/src/test/java/unittests/LexChainingKeywordExtractorTest.java
new file mode 100644
index 0000000..bb9ef9b
--- /dev/null
+++ b/summarizer/src/test/java/unittests/LexChainingKeywordExtractorTest.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package unittests;
+
+import static org.junit.Assert.*;
+
+import java.util.List;
+
+import opennlp.summarization.Sentence;
+import opennlp.summarization.lexicalchaining.LexChainingKeywordExtractor;
+import opennlp.summarization.lexicalchaining.LexicalChain;
+import opennlp.summarization.lexicalchaining.LexicalChainingSummarizer;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Tests keyword extraction from lexical chains built over a sample article.
+ */
+public class LexChainingKeywordExtractorTest {
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+  }
+
+  @Test
+  public void testGetKeywords() throws Exception {
+    String sentFragModel = "resources/en-sent.bin";
+    String articlePath = "/Users/ram/dev/summarizer/test/forram/topnews/input/8.txt";
+    // The fixture path is machine-specific (contributor's home directory).
+    // Skip explicitly when it is absent instead of silently passing; the
+    // original try/catch + printStackTrace meant this test could never fail.
+    // TODO: move the fixture into src/test/resources so it runs everywhere.
+    org.junit.Assume.assumeTrue(new java.io.File(articlePath).exists());
+
+    DefaultDocProcessor dp = new DefaultDocProcessor(sentFragModel);
+    String article = dp.docToString(articlePath);
+    LexicalChainingSummarizer lcs =
+        new LexicalChainingSummarizer(dp, "resources/en-pos-maxent.bin");
+
+    List<Sentence> sent = dp.getSentencesFromStr(article);
+    List<LexicalChain> vh = lcs.buildLexicalChains(article, sent);
+    LexChainingKeywordExtractor ke = new LexChainingKeywordExtractor();
+    List<String> keywords = ke.getKeywords(vh, 5);
+
+    // Assert the extractor's basic contract rather than just printing.
+    assertNotNull(keywords);
+    assertTrue("at most 5 keywords requested", keywords.size() <= 5);
+  }
+
+}