OPENNLP-1594 Add stricter tests for Summarizer component (#158)
- adds further, stricter tests
- clarifies, at API level, the semantics and constraints of parameters
- separates tests so that each test class has a clear responsibility for its class under test
- removes binary model files from test/resources folder
- improves / enhances the JavaDoc further
diff --git a/summarizer/pom.xml b/summarizer/pom.xml
index 19237f3..2c4da8f 100644
--- a/summarizer/pom.xml
+++ b/summarizer/pom.xml
@@ -31,10 +31,18 @@
<name>Apache OpenNLP Summarizer</name>
<properties>
+ <wordnet.version>2.4.0</wordnet.version>
<wordnet-dict.version>3.1</wordnet-dict.version>
<maven.download.plugin>1.9.0</maven.download.plugin>
</properties>
+ <repositories>
+ <repository>
+ <id>maven.aksw.org</id>
+ <url>https://maven.aksw.org/repository/internal/</url>
+ <releases/>
+ </repository>
+ </repositories>
<dependencies>
<dependency>
@@ -45,7 +53,7 @@
<dependency>
<groupId>edu.mit</groupId>
<artifactId>jwi</artifactId>
- <version>2.2.3</version>
+ <version>${wordnet.version}</version>
</dependency>
<dependency>
diff --git a/summarizer/src/main/java/opennlp/summarization/DocProcessor.java b/summarizer/src/main/java/opennlp/summarization/DocProcessor.java
index 65a992f..756744f 100644
--- a/summarizer/src/main/java/opennlp/summarization/DocProcessor.java
+++ b/summarizer/src/main/java/opennlp/summarization/DocProcessor.java
@@ -31,12 +31,20 @@
public interface DocProcessor {
/**
- * Extracts sentences from a string representing an article.
+ * Extracts {@link Sentence sentences} from a string representing an article.
+ *
+ * @param text The text to process; if {@code null} or empty, an empty list is returned.
+ *
+ * @return The resulting list of detected {@link Sentence sentences}.
*/
- List<Sentence> getSentencesFromStr(String text);
+ List<Sentence> getSentences(String text);
/**
- * Parses out words from a specified {@link String sent}.
+ * Extracts words from a specified {@link String sent}.
+ *
+   * @param sent The sentence to process; if {@code null} or empty, a zero-length array is returned.
+ *
+ * @return An array of tokens (words) contained in the given {@code sent}.
*/
String[] getWords(String sent);
diff --git a/summarizer/src/main/java/opennlp/summarization/Score.java b/summarizer/src/main/java/opennlp/summarization/Score.java
index 76a2694..80751d6 100755
--- a/summarizer/src/main/java/opennlp/summarization/Score.java
+++ b/summarizer/src/main/java/opennlp/summarization/Score.java
@@ -18,14 +18,15 @@
package opennlp.summarization;
/**
- * Stores the score of a sentence for ranking sentences within a document.
+ * Encapsulates the score of a sentence for the purpose of ranking sentences within a document.
*/
public class Score implements Comparable<Score> {
private int sentId;
private double score;
- public Score() {
- score = 0;
+ public Score(int sentId, double score) {
+ this.sentId = sentId;
+ this.score = score;
}
public int getSentId() {
@@ -46,7 +47,6 @@
@Override
public int compareTo(Score o) {
-
if (o.score > score) return 1;
else if (o.score < score) return -1;
return 0;
diff --git a/summarizer/src/main/java/opennlp/summarization/Sentence.java b/summarizer/src/main/java/opennlp/summarization/Sentence.java
index a158199..2c03eef 100755
--- a/summarizer/src/main/java/opennlp/summarization/Sentence.java
+++ b/summarizer/src/main/java/opennlp/summarization/Sentence.java
@@ -32,34 +32,44 @@
public class Sentence {
private static final String SPACE = " ";
- private final List<Sentence> links;
+ private final List<Sentence> links = new ArrayList<>();
+ private final int sentId;
+
// sentId is always position of sentence in doc.
- private int sentId;
private String stringVal;
private Score pageRankScore;
private int paragraph;
private int paraPos;
private boolean hasQuote;
- private double wordWt = 0;
- private int wordCnt;
+ private double wordWeight = 0;
+ private int wordCound = 0;
- public Sentence() {
- links = new ArrayList<>();
- }
+ /**
+ * Instantiates a plain {@link Sentence} via a set of parameters.
+ *
+   * @param id A numeric identifier with a non-negative value.
+ * @param stringVal The string representation of the sentence.
+ * @param paragraph TODO clarify exact meaning of and constraints for this parameter.
+   * @param paraPos   TODO clarify exact meaning of and constraints for this parameter.
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public Sentence(int id, String stringVal, int paragraph, int paraPos) {
+ if (id < 0) throw new IllegalArgumentException("Parameter 'id' cannot be negative");
+ if (stringVal == null || stringVal.isBlank())
+      throw new IllegalArgumentException("Parameter 'stringVal' must not be null or blank");
+ if (paragraph < 0) throw new IllegalArgumentException("Parameter 'paragraph' cannot be negative");
+ if (paraPos < 0) throw new IllegalArgumentException("Parameter 'paraPos' cannot be negative");
- public Sentence(int id) {
- this();
this.sentId = id;
- }
+ setParagraph(paragraph);
+ setStringVal(stringVal);
+ setParaPos(paraPos);
+  }
public int getSentId() {
return sentId;
}
- public void setSentId(int sentId) {
- this.sentId = sentId;
- }
-
public Score getPageRankScore() {
return pageRankScore;
}
@@ -84,7 +94,7 @@
this.paraPos = paraPos;
}
- private int calcWrdCnt(String stringVal2) {
+ private int calcWordCount(String stringVal2) {
int ret = 0;
StopWords sw = StopWords.getInstance();
String[] wrds = stringVal.split("\\s+");
@@ -102,7 +112,7 @@
public void setStringVal(String stringVal) {
this.stringVal = stringVal;
if (stringVal.contains("\"")) this.hasQuote = true;
- this.wordCnt = calcWrdCnt(stringVal);
+ this.wordCound = calcWordCount(stringVal);
}
public void addLink(Sentence s) {
@@ -113,38 +123,21 @@
return this.links;
}
- public double getWordWt() {
- return wordWt;
+ public double getWordWeight() {
+ return wordWeight;
}
- public void setWordWt(double wordWt) {
- this.wordWt = wordWt;
+ public void setWordWeight(double wordWt) {
+ this.wordWeight = wordWt;
}
- public int getWordCnt() {
- return wordCnt == 0 ? this.getStringVal().split("\\s+").length : wordCnt;
+ public int getWordCount() {
+ return wordCound;
}
- // Should add an article id to the sentence class. For now returns true if the ids are the same.
-
- @Override
- public final boolean equals(Object o) {
- if (this == o) return true;
- if (!(o instanceof Sentence sentence)) return false;
-
- return sentId == sentence.sentId;
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(sentId);
- }
-
- @Override
- public String toString() {
- return this.stringVal;//+ "("+ this.paragraph +", "+this.paraPos+")";
- }
-
+  /**
+   * @return A fully-stemmed representation of this sentence, obtained by applying stemming to each word.
+   */
public String stem() {
PorterStemmer stemmer = new PorterStemmer();
StopWords sw = StopWords.getInstance();
@@ -167,4 +160,23 @@
}
return b.toString();
}
+
+ // Should add an article id to the sentence class. For now returns true if the ids are the same.
+ @Override
+ public final boolean equals(Object o) {
+ if (this == o) return true;
+ if (!(o instanceof Sentence sentence)) return false;
+
+ return sentId == sentence.sentId;
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(sentId);
+ }
+
+ @Override
+ public String toString() {
+ return this.stringVal; // + "("+ this.paragraph +", "+this.paraPos+")";
+ }
}
diff --git a/summarizer/src/main/java/opennlp/summarization/Summarizer.java b/summarizer/src/main/java/opennlp/summarization/Summarizer.java
index e3ae124..8271868 100644
--- a/summarizer/src/main/java/opennlp/summarization/Summarizer.java
+++ b/summarizer/src/main/java/opennlp/summarization/Summarizer.java
@@ -17,15 +17,18 @@
package opennlp.summarization;
+/**
+ * Describes the API of a component which summarizes the content of news, articles or books.
+ */
public interface Summarizer {
/**
- * Summarizes a given {@code article}. The length of the summary is
+ * Summarizes a given {@code text}. The length of the summary is
* influenced by the specified {@code maxWords} parameter.
*
- * @param article The text to summarize. Must not be {@code null} and not be blank.
+ * @param text The content to summarize. Must not be {@code null} and not be blank.
* @param maxWords The maximum number of words. Must be larger than {@code zero}.
* @return The summary or an {@code empty} String if no summary could be derived.
*/
- String summarize(String article, int maxWords);
+ String summarize(String text, int maxWords);
}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java
deleted file mode 100644
index a313928..0000000
--- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.summarization.lexicalchaining;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * Uses the lexical chaining algorithm to extract keywords.
- */
-public class LexChainingKeywordExtractor {
-
- // Simple logic to pull out the keyword based on longest lexical chains..
- public List<String> getKeywords(List<LexicalChain> lexicalChains, int noOfKeywords) {
- Collections.sort(lexicalChains);
- List<String> ret = new ArrayList<>();
- for (int i = 0; i < Math.min(lexicalChains.size(), noOfKeywords); i++) {
- List<Word> words = lexicalChains.get(i).getWord();
- if (!words.isEmpty() && !ret.contains(words.get(0).getLexicon())) {
- ret.add(words.get(0).getLexicon());
- }
- }
- return ret;
- }
-}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java
index 3da83e3..a7b3cb2 100644
--- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java
@@ -22,17 +22,24 @@
import opennlp.summarization.Sentence;
+/**
+ * Represents a lexical chain.
+ */
public class LexicalChain implements Comparable<LexicalChain> {
- final List<Word> word;
- final List<Sentence> sentences;
- int start, last;
- int score;
+ private final List<Word> words = new ArrayList<>();
+ private final List<Sentence> sentences = new ArrayList<>();
+ private int score;
+
+ int start;
+ int last;
int occurrences = 1;
public LexicalChain() {
- word = new ArrayList<>();
- sentences = new ArrayList<>();
+ }
+
+ public LexicalChain(int start) {
+ this.start = start;
}
public double score() {
@@ -40,7 +47,7 @@
}
public int length() {
- return word.size();
+ return words.size();
}
public float homogeneity() {
@@ -48,7 +55,7 @@
}
public void addWord(Word w) {
- word.add(w);
+ words.add(w);
}
public void addSentence(Sentence sent) {
@@ -56,8 +63,8 @@
sentences.add(sent);
}
- public List<Word> getWord() {
- return word;
+ public List<Word> getWords() {
+ return words;
}
public List<Sentence> getSentences() {
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingKeywordExtractor.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingKeywordExtractor.java
new file mode 100644
index 0000000..0c42897
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingKeywordExtractor.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.summarization.lexicalchaining;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Uses the {@link LexicalChain lexical chaining} algorithm to extract keywords.
+ *
+ * @see LexicalChain
+ */
+public class LexicalChainingKeywordExtractor {
+
+ /**
+ * Extracts keywords from a list of {@link LexicalChain lexical chains}, limited by {@code noOfKeywords}.
+ *
+ * @param lexicalChains The {@link LexicalChain lexical chains} to process. Must not be {@code null}.
+ * @param noOfKeywords The upper limit of keywords. Must be greater than {@code zero}.
+ *
+ * @return The extracted keywords as a list. Guaranteed to be not {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ * @implNote This operation is based on longest lexical chains.
+ */
+ public List<String> extractKeywords(List<LexicalChain> lexicalChains, int noOfKeywords) {
+ if (lexicalChains == null) {
+ throw new IllegalArgumentException("Parameter 'lexicalChains' must not be null.");
+ }
+ if (noOfKeywords <= 0) {
+ throw new IllegalArgumentException("Parameter 'noOfKeywords' must be greater than 0.");
+ }
+ if (lexicalChains.isEmpty()) {
+ return Collections.emptyList();
+ } else {
+ Collections.sort(lexicalChains);
+ List<String> ret = new ArrayList<>();
+ for (int i = 0; i < Math.min(lexicalChains.size(), noOfKeywords); i++) {
+ List<Word> words = lexicalChains.get(i).getWords();
+ if (!words.isEmpty()) {
+ Word w = words.get(0);
+ if (!ret.contains(w.getLexicon())) {
+ ret.add(w.getLexicon());
+ }
+ }
+ }
+ return ret;
+ }
+ }
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java
index f243d69..f748230 100755
--- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java
@@ -17,7 +17,7 @@
package opennlp.summarization.lexicalchaining;
-import java.io.InputStream;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Hashtable;
@@ -26,15 +26,19 @@
import opennlp.summarization.DocProcessor;
import opennlp.summarization.Sentence;
import opennlp.summarization.Summarizer;
+import opennlp.tools.postag.POSModel;
/**
- * Implements the algorithm outlined in - "Summarization Using Lexical Chains" by R. Berzilay et al.
- * <p>
+ * Implements a {@link Summarizer summarization} algorithm outlined in: <br/>
+ * <a href="https://aclanthology.org/W97-0703.pdf">
+ * "Summarization Using Lexical Chains"</a>, by Regina Berzilay and Michael Elhadad.
+ * <br/><br/>
* The algorithm is based on extracting so-called lexical chains - a set of sentences in the article
- * that share a word that are very closely related. Thus, the longest chain represents the most important
+ * that share a {@link Word} that are very closely related. Thus, the longest chain represents the most important
* topic and so forth. A summary can then be formed by identifying the most important lexical chains
* and "pulling" out sentences from them.
*
+ * @see Word
* @see LexicalChain
* @see Summarizer
*/
@@ -44,87 +48,120 @@
private final DocProcessor docProcessor;
private final WordRelationshipDetermination wordRel;
- public LexicalChainingSummarizer(DocProcessor dp, OpenNLPPOSTagger posTagger) {
- docProcessor = dp;
+ /**
+ * Instantiates a {@link LexicalChainingSummarizer}.
+ *
+ * @param docProcessor The {@link DocProcessor} to use at runtime. Must not be {@code null}.
+ * @param languageCode An ISO-language code for obtaining a {@link POSModel}.
+ * Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public LexicalChainingSummarizer(DocProcessor docProcessor, String languageCode) throws IOException {
+ this(docProcessor, new NounPOSTagger(languageCode));
+ }
+
+ /**
+ * Instantiates a {@link LexicalChainingSummarizer}.
+ *
+ * @param docProcessor The {@link DocProcessor} to use at runtime. Must not be {@code null}.
+ * @param posTagger The {@link NounPOSTagger} to use at runtime. Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public LexicalChainingSummarizer(DocProcessor docProcessor, NounPOSTagger posTagger) {
+ if (docProcessor == null) throw new IllegalArgumentException("Parameter 'docProcessor' must not be null!");
+ if (posTagger == null) throw new IllegalArgumentException("Parameter 'posTagger' must not be null!");
+
+ this.docProcessor = docProcessor;
tagger = posTagger;
wordRel = new WordRelationshipDetermination();
}
- public LexicalChainingSummarizer(DocProcessor dp, InputStream posModelFile) throws Exception {
- this(dp, new OpenNLPPOSTagger(dp, posModelFile));
- }
-
- //Build Lexical chains..
- public List<LexicalChain> buildLexicalChains(String article, List<Sentence> sent) {
- // POS tag article
- Hashtable<String, List<LexicalChain>> chains = new Hashtable<>();
- List<LexicalChain> lc = new ArrayList<>();
- // Build lexical chains
- // For each sentence
- for (Sentence currSent : sent) {
- String taggedSent = tagger.getTaggedString(currSent.getStringVal());
- List<String> nouns = tagger.getWordsOfType(taggedSent, POSTagger.NOUN);
- // For each noun
- for (String noun : nouns) {
- int chainsAddCnt = 0;
- // Loop through each LC
- for (LexicalChain l : lc) {
- try {
- WordRelation rel = wordRel.getRelation(l, noun, (currSent.getSentId() - l.start) > 7);
- // Is the noun an exact match to one of the current LCs (Strong relation)
- // Add sentence to chain
- if (rel.relation() == WordRelation.STRONG_RELATION) {
- addToChain(rel.dest(), l, chains, currSent);
- if (currSent.getSentId() - l.last > 10) {
- l.occurrences++;
- l.start = currSent.getSentId();
- }
- chainsAddCnt++;
- } else if (rel.relation() == WordRelation.MED_RELATION) {
- // Add sentence to chain if it is 7 sentences away from start of chain
- addToChain(rel.dest(), l, chains, currSent);
- chainsAddCnt++;
- //If greater than 7 we will add it but call it a new occurrence of the lexical chain...
- if (currSent.getSentId() - l.start > 7) {
- l.occurrences++;
- l.start = currSent.getSentId();
- }
- } else if (rel.relation() == WordRelation.WEAK_RELATION) {
- if (currSent.getSentId() - l.start <= 3) {
+ /**
+ * Constructs a list of {@link LexicalChain lexical chains} from specified sentences.
+ *
+ * @param sentences The list of {@link Sentence sentences} to build lexical chains from.
+ * Must not be {@code null}.
+ * @return The result list of {@link LexicalChain lexical chains}. Guaranteed to be not {@code null}.
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public List<LexicalChain> buildLexicalChains(List<Sentence> sentences) {
+ if (sentences == null) throw new IllegalArgumentException("Parameter 'sentences' must not be null!");
+ else {
+ if (sentences.isEmpty()) {
+ return Collections.emptyList();
+ }
+ Hashtable<String, List<LexicalChain>> chains = new Hashtable<>();
+ List<LexicalChain> lc = new ArrayList<>();
+ // Build lexical chains
+ // For each sentence
+ for (Sentence currSent : sentences) {
+ // POS tag article
+ String taggedSent = tagger.getTaggedString(currSent.getStringVal().replace(".", " ."));
+ List<String> nouns = tagger.getWordsOfType(docProcessor.getWords(taggedSent), POSTagger.NOUN);
+ // For each noun
+ for (String noun : nouns) {
+ int chainsAddCnt = 0;
+ // Loop through each LC
+ for (LexicalChain l : lc) {
+ try {
+ WordRelation rel = wordRel.getRelation(l, noun, (currSent.getSentId() - l.start) > 7);
+ // Is the noun an exact match to one of the current LCs (Strong relation)
+ // Add sentence to chain
+ if (rel.relation() == WordRelation.STRONG_RELATION) {
+ addToChain(rel.dest(), l, chains, currSent);
+ if (currSent.getSentId() - l.last > 10) {
+ l.occurrences++;
+ l.start = currSent.getSentId();
+ }
+ chainsAddCnt++;
+ } else if (rel.relation() == WordRelation.MED_RELATION) {
+ // Add sentence to chain if it is 7 sentences away from start of chain
addToChain(rel.dest(), l, chains, currSent);
chainsAddCnt++;
+ // If greater than 7 we will add it but call it a new occurrence of the lexical chain...
+ if (currSent.getSentId() - l.start > 7) {
+ l.occurrences++;
+ l.start = currSent.getSentId();
+ }
+ } else if (rel.relation() == WordRelation.WEAK_RELATION) {
+ if (currSent.getSentId() - l.start <= 3) {
+ addToChain(rel.dest(), l, chains, currSent);
+ chainsAddCnt++;
+ }
}
+ } catch (Exception ex) {
+ throw new RuntimeException(ex);
}
- } catch (Exception ex) {
+ // add sentence and update last occurrence..
+ //chaincnt++
+ // else 1 hop-relation in Wordnet (weak relation)
+ // Add sentence to chain if it is 3 sentences away from start of chain
+ //chaincnt++
+ // End loop LC
}
- // add sentence and update last occurrence..
- //chaincnt++
- // else 1 hop-relation in Wordnet (weak relation)
- // Add sentence to chain if it is 3 sentences away from start of chain
- //chaincnt++
- // End loop LC
- }
- //Could not add the word to any existing list. Start a new lexical chain with the word.
- if (chainsAddCnt == 0) {
- List<Word> senses = wordRel.getWordSenses(noun);
- for (Word w : senses) {
- LexicalChain newLc = new LexicalChain();
- newLc.start = currSent.getSentId();
- addToChain(w, newLc, chains, currSent);
- lc.add(newLc);
+ // Could not add the word to any existing list. Start a new lexical chain with the word.
+ if (chainsAddCnt == 0) {
+ List<Word> senses = wordRel.getWordSenses(noun);
+ for (Word w : senses) {
+ LexicalChain newLc = new LexicalChain(currSent.getSentId());
+ addToChain(w, newLc, chains, currSent);
+ lc.add(newLc);
+ }
}
+ if (lc.size() > 20)
+ purge(lc, currSent.getSentId(), sentences.size());
}
- if (lc.size() > 20)
- purge(lc, currSent.getSentId(), sent.size());
+ //End sentence
}
- //End sentence
- }
// disambiguateAndCleanChains(lc, chains);
- // Calculate score
- // Length of chain * homogeneity
- //sort LC by strength.
- return lc;
+ // Calculate score
+ // Length of chain * homogeneity
+ //sort LC by strength.
+ return lc;
+ }
}
/*
@@ -132,7 +169,7 @@
* Takes care to only remove small chains that were added "long back"
*/
private void purge(List<LexicalChain> lc, int sentId, int totSents) {
- //Do nothing for the first 50 sentences.
+    // Do nothing while fewer than 20 lexical chains have been collected.
if (lc.size() < 20) return;
Collections.sort(lc);
@@ -146,12 +183,12 @@
LexicalChain l = lc.get(i);
if (l.score() < cutOff && (sentId - l.last) > totSents / 3)// && containsAllWords(words, l.word))
toRem.add(l);
- //A different sense and added long back.
- else if (words.containsKey(l.getWord().get(0).getLexicon()) && (sentId - l.start) > totSents / 10)
+ // A different sense and added long back.
+ else if (words.containsKey(l.getWords().get(0).getLexicon()) && (sentId - l.start) > totSents / 10)
toRem.add(l);
else {
- //Check if this is from a word with different sense..
- for (Word w : l.word)
+ // Check if this is from a word with different sense..
+ for (Word w : l.getWords())
words.put(w.getLexicon(), Boolean.TRUE);
}
}
@@ -169,9 +206,7 @@
return ret;
}
- private void addToChain(Word noun, LexicalChain l,
- Hashtable<String, List<LexicalChain>> chains, Sentence sent) {
-
+ private void addToChain(Word noun, LexicalChain l, Hashtable<String, List<LexicalChain>> chains, Sentence sent) {
l.addWord(noun);
l.addSentence(sent);
l.last = sent.getSentId();
@@ -182,20 +217,18 @@
@Override
public String summarize(String article, int maxWords) {
- List<Sentence> sent = docProcessor.getSentencesFromStr(article);
- List<LexicalChain> lc = buildLexicalChains(article, sent);
+ List<Sentence> sent = docProcessor.getSentences(article);
+ List<LexicalChain> lc = buildLexicalChains(sent);
Collections.sort(lc);
int summSize = 0;
List<Sentence> summ = new ArrayList<>();
StringBuilder sb = new StringBuilder();
- for (int i = 0; i < lc.size(); i++) {
- LexicalChain chain = lc.get(i);
- for (int j = 0; j < chain.sentences.size(); j++) {
- Sentence candidate = chain.sentences.get(j);
+ for (LexicalChain chain : lc) {
+ for (Sentence candidate : chain.getSentences()) {
if (!summ.contains(candidate)) {
summ.add(candidate);
sb.append(candidate.getStringVal()).append(" ");
- summSize += candidate.getWordCnt();
+ summSize += candidate.getWordCount();
break;
}
}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java
new file mode 100644
index 0000000..2acc60b
--- /dev/null
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.summarization.lexicalchaining;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.DownloadUtil;
+
+/**
+ * A {@link POSTagger} wrapper implementation that relies on an OpenNLP {@link POSTaggerME}.
+ *
+ * @see POSTagger
+ * @see POSTaggerME
+ */
+public class NounPOSTagger implements POSTagger {
+
+ public static final String[] TAGS_NOUNS = {"NOUN", "NN", "NNS", "NNP", "NNPS"};
+ private static final Set<String> EOS_CHARS = Set.of(".", "?", "!");
+
+ private final POSTaggerME tagger;
+ private final Map<Integer, String[]> tagMap = new Hashtable<>();
+
+ /**
+ * Instantiates a {@link NounPOSTagger} for a POS model for the specified {@code languageCode}.
+ *
+ * @param languageCode An ISO-language code for obtaining a {@link POSModel}.
+ * Must not be {@code null}.
+ * @throws IOException Thrown if IO errors occurred.
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public NounPOSTagger(String languageCode) throws IOException {
+ if (languageCode == null || languageCode.isBlank())
+ throw new IllegalArgumentException("Parameter 'languageCode' must not be null");
+ // init Tag map
+ tagMap.put(POSTagger.NOUN, TAGS_NOUNS);
+ POSModel posModel = DownloadUtil.downloadModel(languageCode, DownloadUtil.ModelType.POS, POSModel.class);
+ tagger = new POSTaggerME(posModel);
+ }
+
+ /**
+ * @return {@code true} if the type string belongs to one of the (noun) tags for the type,
+ * {@code false} otherwise.
+ */
+ public boolean isType(String typeStr, int type) {
+ boolean ret = false;
+ String[] tags = tagMap.get(type);
+ if (tags != null) {
+ for (String tag : tags) {
+ if (typeStr.equalsIgnoreCase(tag)) {
+ ret = true;
+ break;
+ }
+ }
+ return ret;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public String getTaggedString(String input) {
+ if (input == null) throw new IllegalArgumentException("Parameter 'input' must not be null");
+
+ String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(input);
+ String[] tags = tagger.tag(tokens);
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < tokens.length; i++) {
+ sb.append(tokens[i]).append("/").append(tags[i]);
+ // whitespace appending only for non-EOS / PUNCT tokens, skipping for actual EOS tokens
+ if (! (EOS_CHARS.contains(tokens[i]) && tokens.length == i + 1)) {
+ sb.append(" ");
+ }
+ }
+ return sb.toString();
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public List<String> getWordsOfType(String[] tokens, int type) {
+ if (tokens == null) throw new IllegalArgumentException("Parameter 'tokens' must not be null");
+ if (type < 0 || type > PRONOUN) throw new IllegalArgumentException("Parameter 'type' must be in range [0, 4]");
+
+ List<String> ret = new ArrayList<>();
+ for (String t : tokens) {
+ String[] wordPlusType = t.split("/");
+ if (wordPlusType.length == 2) {
+ if (isType(wordPlusType[1], type))
+ ret.add(wordPlusType[0]);
+ } else {
+ throw new IllegalArgumentException("Token '" + t + "' is not tagged correctly!");
+ }
+ }
+ // log.info(ret.toString());
+ return ret;
+ }
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java
deleted file mode 100644
index 39edde3..0000000
--- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.summarization.lexicalchaining;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Hashtable;
-import java.util.List;
-
-import opennlp.summarization.DocProcessor;
-import opennlp.tools.postag.POSModel;
-import opennlp.tools.postag.POSTaggerME;
-import opennlp.tools.tokenize.WhitespaceTokenizer;
-
-public class OpenNLPPOSTagger implements POSTagger {
-
- private final POSTaggerME tagger;
- private final DocProcessor dp;
- private final String[] nounTags = {"NOUN", "NN", "NNS", "NNP", "NNPS"};
- private Hashtable<Integer, String[]> tagMap;
-
- public OpenNLPPOSTagger(DocProcessor dp, InputStream posModelFile) throws IOException {
- this.dp = dp;
- initTagMap();
-
- try (InputStream modelIn = new BufferedInputStream(posModelFile)) {
- POSModel model = new POSModel(modelIn);
- tagger = new POSTaggerME(model);
- }
- }
-
- private void initTagMap() {
- tagMap = new Hashtable<>();
- tagMap.put(POSTagger.NOUN, nounTags);
- }
-
- // Returns true if the type string belongs to one of the tags for the type
- public boolean isType(String typeStr, int type) {
- boolean ret = false;
- String[] tags = tagMap.get(type);
- for (String tag : tags) {
- if (typeStr.equalsIgnoreCase(tag)) {
- ret = true;
- break;
- }
- }
- return ret;
- }
-
- @Override
- public String getTaggedString(String input) {
- String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(input);
- String[] tags = tagger.tag(tokens);
- StringBuilder sb = new StringBuilder();
- for (int i = 0; i < tokens.length; i++) {
- sb.append(tokens[i]).append("/").append(tags[i]).append(" ");
- }
- return sb.toString();
- }
-
- @Override
- public List<String> getWordsOfType(String sent, int type) {
- List<String> ret = new ArrayList<>();
- String[] tokens = dp.getWords(sent);
- for (String t : tokens) {
- String[] wordPlusType = t.split("/");
- if (wordPlusType.length == 2) {
- if (isType(wordPlusType[1], type))
- ret.add(wordPlusType[0]);
- }
- }
- // log.info(ret.toString());
- return ret;
- }
-}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java
index d6b5d2d..3926181 100644
--- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java
@@ -19,6 +19,10 @@
import java.util.List;
+/**
+ * A basic POS tagger which describes functionality to tag text and
+ * filter tokens for certain word classes.
+ */
public interface POSTagger {
//Tagger types..
@@ -28,7 +32,26 @@
int ADVERB = 3;
int PRONOUN = 4;
+ /**
+ * Tags a given {@code input} text so that word classes are appended to each token.
+ *
+ * @param input The text to process. Must not be {@code null}. If empty, an empty String is returned.
+ * @return The POS tagged text. May be empty.
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
String getTaggedString(String input);
- List<String> getWordsOfType(String sent, int type);
+ /**
+ * Extracts words from POS-tagged {@code tokens} which equal a certain word class ({@code type}).
+ *
+ * @param tokens An array of words to filter for its word class ({@code type}). Must not be {@code null}.
+ * Must be in a tagged form, that is, separated into {@code token/word-class} pairs.
+ * @param type One of the supported types: {@link #NOUN}, {@link #VERB}, {@link #ADJECTIVE},
+ * {@link #ADVERB}, or {@link #PRONOUN}. Must not be less than {@code zero}
+ * and not be more than {@link #PRONOUN}.
+ * @return A list of words that match the given {@code type}. May be empty, yet guaranteed to be non-{@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ List<String> getWordsOfType(String[] tokens, int type);
}
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
index ebe352f..9079b62 100644
--- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
@@ -58,7 +58,7 @@
try {
DICTIONARY.open();
} catch (IOException e) {
- e.printStackTrace();
+ throw new RuntimeException(e);
}
}
@@ -130,10 +130,7 @@
WordnetWord ww = (WordnetWord) w;
IWord syn;
if ((syn = this.isSynonym(noun, w)) != null) {
- ret = new WordnetWord();
- ret.lexicon = noun;
- ret.id = syn.getID();
- ret.wordSense = syn.getSenseKey();
+ ret = new WordnetWord(noun, syn.getSenseKey(), syn.getID());
}
//Construct an IWord object representing word associated with wordID
@@ -156,10 +153,7 @@
ISynset s = this.DICTIONARY.getSynset(id);
IWord mat = inSynset(s, idxNoun);
if (mat != null) {
- ret = new WordnetWord();
- ret.lexicon = noun;
- ret.id = mat.getID();
- ret.wordSense = mat.getSenseKey();
+ ret = new WordnetWord(noun, mat.getSenseKey(), mat.getID());
break;
}
}
@@ -175,7 +169,7 @@
*/
public WordRelation getRelation(LexicalChain l, String noun, boolean checkMed) {
WordRelation ret = new WordRelation(null, null, WordRelation.NO_RELATION);
- for (Word w : l.word) {
+ for (Word w : l.getWords()) {
//Exact match is a string relation.
if (w.getLexicon().equalsIgnoreCase(noun)) {
ret = new WordRelation(w, w, WordRelation.STRONG_RELATION);
@@ -199,15 +193,12 @@
// openDict();
List<IWordID> wordIDs = this.DICTIONARY.getIndexWord(noun, POS.NOUN).getWordIDs();
for (IWordID wid : wordIDs) {
- Word w = new WordnetWord();
- w.setLexicon(noun);
- w.setID(wid);
+ Word w = new WordnetWord(noun, wid);
ret.add(w);
}
} catch (Exception ex) {
//Not in dictionary
- Word w = new WordnetWord();
- w.setLexicon(noun);
+ Word w = new WordnetWord(noun);
ret.add(w);
}
return ret;
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java
index a110719..0cf026d 100644
--- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java
@@ -18,6 +18,7 @@
import java.util.Hashtable;
import java.util.List;
+import java.util.Objects;
import edu.mit.jwi.item.IPointer;
import edu.mit.jwi.item.ISenseKey;
@@ -25,16 +26,58 @@
import edu.mit.jwi.item.ISynsetID;
import edu.mit.jwi.item.IWordID;
+/**
+ * A {@link Word} implementation based on Wordnet concepts.
+ */
public class WordnetWord implements Word {
- final Hashtable<IPointer, List<ISynsetID>> rels;
- String lexicon;
- ISenseKey wordSense;
- IWordID id;
- //Cache..
+
+ private String lexicon;
+ private IWordID id;
+ private ISenseKey wordSense;
+
+ final Hashtable<IPointer, List<ISynsetID>> rels = new Hashtable<>();
+ // Cache..
ISynset synonyms;
- public WordnetWord() {
- rels = new Hashtable<>();
+ /**
+ * Instantiates a {@link WordnetWord} via its lexicon term.
+ *
+ * @param lexicon Must not be {@code null} and not be an empty string.
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public WordnetWord(String lexicon) {
+ if (lexicon == null || lexicon.isBlank()) throw new IllegalArgumentException("parameter 'lexicon' must not be null or empty");
+ setLexicon(lexicon);
+ }
+
+ /**
+ * Instantiates a {@link WordnetWord} via its lexicon term and a {@link IWordID}.
+ *
+ * @param lexicon Must not be {@code null} and not be an empty string.
+ * @param id A unique identifier sufficient to retrieve a particular word from the Wordnet database.
+ * Must not be {@code null}.
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public WordnetWord(String lexicon, IWordID id) {
+ this(lexicon);
+ if (id == null) throw new IllegalArgumentException("parameter 'id' must not be null");
+ setID(id);
+ }
+
+ /**
+ * Instantiates a {@link WordnetWord} via its lexicon term, an {@link ISenseKey}, and an {@link IWordID}.
+ *
+ * @param lexicon Must not be {@code null} and not be an empty string.
+ * @param wordSense The sense key, a unique string that identifies a Wordnet word.
+ * Must not be {@code null}.
+ * @param id A unique identifier sufficient to retrieve a particular word from the Wordnet database.
+ * Must not be {@code null}.
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public WordnetWord(String lexicon, ISenseKey wordSense, IWordID id) {
+ this(lexicon, id);
+ if (wordSense == null) throw new IllegalArgumentException("parameter 'wordSense' must not be null");
+ setSense(wordSense);
}
@Override
@@ -73,7 +116,17 @@
}
@Override
+ public final boolean equals(Object o) {
+ if (this == o) return true;
+ if (!(o instanceof WordnetWord that)) return false;
+
+ return Objects.equals(lexicon, that.lexicon) && Objects.equals(id, that.id);
+ }
+
+ @Override
public int hashCode() {
- return toString().hashCode();
+ int result = Objects.hashCode(lexicon);
+ result = 31 * result + Objects.hashCode(id);
+ return result;
}
}
diff --git a/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java b/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java
index 7fa1155..113dcec 100644
--- a/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java
+++ b/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java
@@ -28,7 +28,7 @@
import opennlp.summarization.Summarizer;
import opennlp.summarization.lexicalchaining.LexicalChain;
import opennlp.summarization.lexicalchaining.LexicalChainingSummarizer;
-import opennlp.summarization.lexicalchaining.OpenNLPPOSTagger;
+import opennlp.summarization.lexicalchaining.NounPOSTagger;
import opennlp.summarization.textrank.TextRankSummarizer;
import opennlp.summarization.DocProcessor;
@@ -47,7 +47,7 @@
private final TextRankSummarizer textRank;
private final LexicalChainingSummarizer lcs;
- public MetaSummarizer(DocProcessor docProcessor, OpenNLPPOSTagger posTagger) {
+ public MetaSummarizer(DocProcessor docProcessor, NounPOSTagger posTagger) {
dp = docProcessor;
textRank = new TextRankSummarizer(dp);
lcs = new LexicalChainingSummarizer(dp, posTagger);
@@ -75,11 +75,11 @@
}
public List<Score> rankSentences(String article, List<Sentence> sent, int maxWords) {
- List<LexicalChain> lc = lcs.buildLexicalChains(article, sent);
+ List<LexicalChain> lc = lcs.buildLexicalChains(sent);
Collections.sort(lc);
Hashtable<Integer, Score> sentScores = new Hashtable<>();
try {
- List<Score> scores = textRank.rankSentences(article, sent, article.length());
+ List<Score> scores = textRank.rankSentences(sent, article.length());
for (Score s : scores) sentScores.put(s.getSentId(), s);
} catch (Exception ex) {
ex.printStackTrace();
@@ -102,7 +102,7 @@
else {
finalSc.add(sentScores.get(s.getSentId()));
summSents.put(s, true);
- currWordCnt += s.getWordCnt();
+ currWordCnt += s.getWordCount();
break;
}
}
@@ -117,7 +117,7 @@
@Override
public String summarize(String article, int maxWords) {
// Build lexical Chains..
- List<Sentence> sent = dp.getSentencesFromStr(article);
+ List<Sentence> sent = dp.getSentences(article);
List<Score> finalSc = rankSentences(article, sent, maxWords);
StringBuilder sb = new StringBuilder();
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
index c185361..a638d68 100755
--- a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
+++ b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
@@ -17,8 +17,7 @@
package opennlp.summarization.preprocess;
-import java.io.BufferedInputStream;
-import java.io.FileReader;
+import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
@@ -28,8 +27,6 @@
import java.util.ArrayList;
import java.util.Locale;
import java.util.Hashtable;
-import java.util.logging.Level;
-import java.util.logging.Logger;
import java.util.regex.Pattern;
import opennlp.summarization.Sentence;
@@ -38,6 +35,7 @@
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;
import opennlp.tools.stemmer.Stemmer;
+import opennlp.tools.util.DownloadUtil;
/**
* Parses a document to sentences.
@@ -53,16 +51,21 @@
private static final int SENTENCE_FRAG = OPEN_NLP;
private final Stemmer stemmer;
- private SentenceModel sentModel;
+ private final SentenceModel sentModel;
- public DefaultDocProcessor(InputStream fragModelFile) {
+ /**
+ * Instantiates a {@link DocProcessor} for a Sentence detection model for the specified {@code languageCode}.
+ *
+ * @param languageCode An ISO-language code for obtaining a {@link SentenceModel}.
+ * Must not be {@code null} and not be blank.
+ * @throws IOException Thrown if IO errors occurred.
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public DefaultDocProcessor(String languageCode) throws IOException {
+ if (languageCode == null || languageCode.isBlank())
+ throw new IllegalArgumentException("Parameter 'languageCode' must not be null or blank");
stemmer = new PorterStemmer();
-
- try (InputStream modelIn = new BufferedInputStream(fragModelFile)) {
- sentModel = new SentenceModel(modelIn);
- } catch (Exception ex) {
- Logger.getAnonymousLogger().info("Error while parsing.. Ignoring the line and marching on.. " + ex.getMessage());
- }
+ sentModel = DownloadUtil.downloadModel(languageCode, DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
}
// Str - Document or para
@@ -81,8 +84,8 @@
for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
String sentence = str.substring(start, end);//str.substring(oldSentEndIdx, sentEndIdx).trim();
- //Add the sentence as-is; do any processing at the word level
- //To lower case and trim all punctuations
+ // Add the sentence as-is; do any processing at the word level
+ // To lower case and trim all punctuations
sentences.add(sentence);
wrdItr.setText(sentence);
StringBuilder procSent = new StringBuilder();
@@ -93,12 +96,12 @@
String word = sentence.substring(wrdStrt, wrdEnd);//words[i].trim();
word = word.replace(REGEX, "");
- //Skip stop words and stem the word
+ // Skip stop words and stem the word
if (sw.isStopWord(word)) continue;
String stemedWrd = stemmer.stem(word).toString();
- //update iidx by adding the current sentence to the list
+ // update iidx by adding the current sentence to the list
if (iidx != null) {
if (stemedWrd.length() > 1) {
List<Integer> sentList = iidx.get(stemedWrd);
@@ -107,7 +110,7 @@
}
sentList.add(sentCnt);
- //Save it back
+ // Save it back
iidx.put(stemedWrd, sentList);
}
}
@@ -121,60 +124,77 @@
}
- public String docToString(String fileName) {
- StringBuilder docBuffer = new StringBuilder();
+ /**
+ * Reads a document's content from a file.
+ *
+ * @param fileName The classpath-relative resource path of the file to read in.
+ * If {@code null} or empty, an empty String is returned.
+ * @return A string representation of the file's contents.
+ */
+ public String docToString(String fileName) throws IOException {
+ if (fileName == null || fileName.isBlank()) {
+ return "";
+ } else {
+ StringBuilder docBuffer = new StringBuilder();
+ try (InputStream in = DefaultDocProcessor.class.getResourceAsStream(fileName);
+ LineNumberReader lnr = new LineNumberReader(new InputStreamReader(in))) {
+ String nextLine;
- try (InputStream in = DefaultDocProcessor.class.getResourceAsStream(fileName);
- LineNumberReader lnr = new LineNumberReader(new InputStreamReader(in))) {
- String nextLine;
-
- while ((nextLine = lnr.readLine()) != null) {
- String trimmedLine = nextLine.trim();
- if (!trimmedLine.isEmpty()) {
- docBuffer.append(REPLACEMENT_PATTERN.matcher(trimmedLine).replaceAll("")).append(" ");
- }
- }
- } catch (Exception ex) {
- Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex);
- }
- return docBuffer.toString();
- }
-
- //List of sentences form a document
- public List<Sentence> docToSentList(String fileName) {
- List<Sentence> sentList = new ArrayList<>();
-
- try (LineNumberReader lnr = new LineNumberReader(new FileReader(fileName))) {
- String nextLine;
- int paraNo = 0;
- int sentNo = 0;
- while ((nextLine = lnr.readLine()) != null) {
- String trimmedLine = nextLine.trim();
- if (!trimmedLine.isEmpty()) {
- List<String> sents = new ArrayList<>();
- List<String> cleanedSents = new ArrayList<>();
- this.getSentences(trimmedLine, sents, null, cleanedSents);
- int paraPos = 1;
- for (String sen : sents) {
- Sentence s = new Sentence();
- s.setSentId(sentNo++);
- s.setParagraph(paraNo);
- s.setStringVal(sen);
- s.setParaPos(paraPos++);
- sentList.add(s);
+ while ((nextLine = lnr.readLine()) != null) {
+ String trimmedLine = nextLine.trim();
+ if (!trimmedLine.isEmpty()) {
+ docBuffer.append(REPLACEMENT_PATTERN.matcher(trimmedLine).replaceAll("")).append(" ");
}
- paraNo++;
}
}
-
- } catch (Exception ex) {
- Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex);
+ return docBuffer.toString();
}
- return sentList;
}
+ /**
+ * Reads a document's content from a file.
+ *
+ * @param fileName The path relative file reference of the resource to read in.
+ * If {@code null} or empty, an empty List is returned.
+ * @return A list of {@link Sentence sentences} representing the file's contents.
+ */
+ public List<Sentence> docToSentences(String fileName) throws IOException {
+ if (fileName == null || fileName.isBlank()) {
+ return Collections.emptyList();
+ } else {
+ List<Sentence> sentList = new ArrayList<>();
+ try (InputStream in = DefaultDocProcessor.class.getResourceAsStream(fileName);
+ LineNumberReader lnr = new LineNumberReader(new InputStreamReader(in))) {
+ String nextLine;
+ int paraNo = 0;
+ int sentNo = 0;
+ while ((nextLine = lnr.readLine()) != null) {
+ String trimmedLine = nextLine.trim();
+ if (!trimmedLine.isEmpty()) {
+ List<String> sents = new ArrayList<>();
+ List<String> cleanedSents = new ArrayList<>();
+ this.getSentences(trimmedLine, sents, null, cleanedSents);
+ int paraPos = 1;
+ for (String sen : sents) {
+ Sentence s = new Sentence(sentNo++, sen, paraNo, paraPos++);
+ sentList.add(s);
+ }
+ paraNo++;
+ }
+ }
+ }
+ return sentList;
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ */
@Override
- public List<Sentence> getSentencesFromStr(String text) {
+ public List<Sentence> getSentences(String text) {
+ if (text == null || text.isBlank()) {
+ return Collections.emptyList();
+ }
List<Sentence> ret = new ArrayList<>();
List<String> sentStrs = new ArrayList<>();
List<String> cleanedSents = new ArrayList<>();
@@ -188,24 +208,28 @@
Collections.addAll(sentStrs, sentences);
}
int sentNo = 0;
-
for (String sen : sentStrs) {
- Sentence s = new Sentence();
- s.setSentId(sentNo);
- s.setParagraph(1);
- s.setStringVal(sen);
- s.setParaPos(sentNo);
+ Sentence s = new Sentence(sentNo, sen, 1, sentNo);
ret.add(s);
sentNo++;
}
return ret;
}
+ /**
+ * {@inheritDoc}
+ */
@Override
public String[] getWords(String sent) {
+ if (sent == null || sent.isBlank()) {
+ return new String[0];
+ }
return sent.trim().split("\\s+");
}
+ /**
+ * {@inheritDoc}
+ */
@Override
public Stemmer getStemmer() {
return stemmer;
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java b/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java
index 8b88cd6..b6eef0b 100755
--- a/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java
+++ b/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java
@@ -17,6 +17,7 @@
package opennlp.summarization.preprocess;
+import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Hashtable;
@@ -29,12 +30,17 @@
* @see WordWeight
*/
public class IDFWordWeight implements WordWeight {
+
private static IDFWordWeight instance;
final Hashtable<String, Double> idf;
public IDFWordWeight(String fileName) {
idf = new Hashtable<>();
- load(fileName);
+ try {
+ load(fileName);
+ } catch (IOException e) {
+ throw new RuntimeException("Could not load the file with IDF", e);
+ }
}
public static IDFWordWeight getInstance(String fileName) {
@@ -58,7 +64,7 @@
* Loads the IDF for words from given file. The file is required to have a simple format -
* word, IDF.
*/
- private void load(String fileName) {
+ private void load(String fileName) throws IOException {
try (InputStream in = IDFWordWeight.class.getResourceAsStream(fileName);
LineNumberReader lnr = new LineNumberReader(new InputStreamReader(in))) {
@@ -72,9 +78,6 @@
idf.put(word, idfVal);
}
}
- } catch (Exception ex) {
- System.err.println("Could not load the file with IDF");
- ex.printStackTrace();
}
}
}
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java b/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java
index deb338d..c558dee 100755
--- a/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java
+++ b/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java
@@ -215,7 +215,7 @@
h.add("your");
h.add("yours");
h.add("yourself");
- h.add("yourselves ");
+ h.add("yourselves");
}
public static StopWords getInstance() {
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java b/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java
index 97866aa..1998434 100755
--- a/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java
+++ b/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java
@@ -17,7 +17,14 @@
package opennlp.summarization.preprocess;
+/**
+ * Represents a type which can compute the weight of a word in a certain context, e.g. a sentence or a text.
+ */
public interface WordWeight {
- double getWordWeight(String s);
+ /**
+ * @param token The input token (word) to get a weight for. Must not be {@code null}.
+ * @return The associated weight for the specified {@code token}.
+ */
+ double getWordWeight(String token);
}
diff --git a/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
index 3ead306..9b29f38 100755
--- a/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
+++ b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
@@ -30,46 +30,74 @@
import opennlp.summarization.preprocess.WordWeight;
/**
- * Implements the TextRank algorithm by Mihalcea et al.
- * <p>
+ * Implements the TextRank algorithm by Rada Mihalcea and Paul Tarau: <br/>
+ * <a href="https://aclanthology.org/W04-3252/">TextRank: Bringing Order into Text</a>
+ * <br/><br/>
* This basically applies the page rank algorithm to a graph where each sentence is a node
* and a connection between sentences indicates that a word is shared between them.
+ * <p>
* It returns a ranking of sentences where the highest rank means most important etc.
* Currently, only stemming is done to the words; a more sophisticated way might use a
* resource like Wordnet to match synonyms etc.
*/
public class TextRank {
+
private static final int NO_OF_IT = 100;
// DAMPING FACTOR..
private static final double DF = 0.15;
private static final boolean HIGHER_TITLE_WEIGHT = true;
private static final double TITLE_WRD_WT = 2d;
+
+ private final DocProcessor docProc;
private final StopWords sw;
private final WordWeight wordWt;
+
private final double maxErr = 0.1;
private final double title_wt = 0;
- private String article;
- private Hashtable<Integer, List<Integer>> links;
+
+ private Hashtable<Integer, List<Integer>> links = new Hashtable<>();
private List<String> sentences = new ArrayList<>();
private List<String> processedSent = new ArrayList<>();
- private DocProcessor docProc;
+ /**
+ * Instantiates a {@link TextRank} with the specified {@link DocProcessor}.
+ *
+ * @param dp A valid {@link DocProcessor}. Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
public TextRank(DocProcessor dp) {
- sw = new StopWords();
- setLinks(new Hashtable<>());
- processedSent = new ArrayList<>();
- docProc = dp;
- wordWt = IDFWordWeight.getInstance("/meta/idf.csv");
+ this(dp, new StopWords(), IDFWordWeight.getInstance("/idf.csv"));
}
- public TextRank(StopWords sw, WordWeight wordWts) {
- this.sw = sw;
- this.wordWt = wordWts;
+ /**
+ * Instantiates a {@link TextRank} with the specified {@link DocProcessor}.
+ *
+ * @param dp A valid {@link DocProcessor}. Must not be {@code null}.
+ * @param stopWords The {@link StopWords} instance to use. Must not be {@code null}.
+ * @param wordWeights The {@link WordWeight} instance to use. Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public TextRank(DocProcessor dp, StopWords stopWords, WordWeight wordWeights) {
+ if (dp == null) throw new IllegalArgumentException("parameter 'dp' must not be null");
+ if (stopWords == null) throw new IllegalArgumentException("parameter 'stopWords' must not be null");
+ if (wordWeights == null) throw new IllegalArgumentException("parameter 'wordWeights' must not be null");
+ this.docProc = dp;
+ this.sw = stopWords;
+ this.wordWt = wordWeights;
}
- // Returns similarity of two sentences. Wrd wts contains tf-idf of the words..
- public double getWeightedSimilarity(String sent1, String sent2,
- Hashtable<String, Double> wrdWts) {
+ /**
+ * Computes the similarity of two sentences.
+ *
+ * @param sent1 The first sentence. If {@code null} or empty the computation will result in {@code 0.0}.
+ * @param sent2 The second sentence. If {@code null} or empty the computation will result in {@code 0.0}.
+ * @param wrdWts The mapping table contains tf-idf of the words.
+ * @return The computed similarity. If no similarity exists, the resulting value equals {@code 0.0}.
+ */
+ public double getWeightedSimilarity(String sent1, String sent2, Hashtable<String, Double> wrdWts) {
+
String[] words1 = docProc.getWords(sent1);
String[] words2 = docProc.getWords(sent2);
double wordsInCommon = 0;
@@ -97,13 +125,17 @@
return (wordsInCommon) / (words1.length + words2.length);
}
- // Gets the current score from the list of scores passed ...
+ /**
+ * @param scores A list of {@link Score} instances.
+ * @param id The sentence id to check for.
+ * @return The score of the element in {@code scores} that matches the given sentence {@code id}, or {@code 1} if no match is found.
+ */
public double getScoreFrom(List<Score> scores, int id) {
for (Score s : scores) {
if (s.getSentId() == id)
return s.getScore();
}
- return 1;
+ return 1; // TODO(review): clarify why the default score is 1 when no matching sentence id is found
}
// This method runs the page rank algorithm for the sentences.
@@ -114,9 +146,7 @@
List<Score> currWtScores = new ArrayList<>();
// Start with equal weights for all sentences
for (int i = 0; i < rawScores.size(); i++) {
- Score ns = new Score();
- ns.setSentId(rawScores.get(i).getSentId());
- ns.setScore((1 - title_wt) / (rawScores.size()));// this.getSimilarity();
+ Score ns = new Score(rawScores.get(i).getSentId(), (1 - title_wt) / (rawScores.size())); // this.getSimilarity();
currWtScores.add(ns);
}
// currWtScores.get(0).score = this.title_wt;
@@ -129,8 +159,6 @@
// Update the scores for the current iteration..
for (Score rs : rawScores) {
int sentId = rs.getSentId();
- Score ns = new Score();
- ns.setSentId(sentId);
List<Integer> neighbors = getLinks().get(sentId);
double sum = 0;
@@ -145,7 +173,7 @@
sum += wij / sigmawjk * txtRnkj;
}
}
- ns.setScore((1d - DF) + sum * DF);// * rs.score
+ Score ns = new Score(sentId, (1d - DF) + sum * DF); // * rs.score
totErr += ns.getScore() - getScoreFrom(rawScores, sentId);
newWtScores.add(ns);
}
@@ -169,8 +197,7 @@
for (int i = 0; i < sentences.size(); i++) {
String nextSent = sentences.get(i);
String[] words = docProc.getWords(nextSent);
- Score s = new Score();
- s.setSentId(i);
+ Score s = new Score(i, 0d);
for (String word : words) {
String currWrd = docProc.getStemmer().stem(word).toString(); //stemmer.toString();
@@ -215,12 +242,12 @@
return wrdWt;
}
- public List<Score> getRankedSentences(String doc, List<String> sentences,
+ public List<Score> getRankedSentences(List<String> sentences,
Hashtable<String, List<Integer>> iidx, List<String> processedSent) {
this.sentences = sentences;
this.processedSent = processedSent;
- Hashtable<String, Double> wrdWts = toWordWtHashtable(this.wordWt, iidx);// new
+ Hashtable<String, Double> wrdWts = toWordWtHashtable(this.wordWt, iidx); // new
if (HIGHER_TITLE_WEIGHT && !getSentences().isEmpty()) {
String sent = getSentences().get(0);
@@ -250,14 +277,6 @@
this.sentences = sentences;
}
- public String getArticle() {
- return article;
- }
-
- public void setArticle(String article) {
- this.article = article;
- }
-
public Hashtable<Integer, List<Integer>> getLinks() {
return links;
}
@@ -265,14 +284,5 @@
private void setLinks(Hashtable<Integer, List<Integer>> links) {
this.links = links;
}
-}
-/*
- * public double getScore(String sent1, String sent2, boolean toPrint) {
- * String[] words1 = sent1.split("\\s+"); String[] words2 = sent2.split("\\s+");
- * double wordsInCommon = 0; for(int i=0;i< words1.length;i++) { for(int
- * j=0;j<words2.length;j++) { if(!sw.isStopWord(words1[i]) &&
- * !words1[i].trim().isEmpty() && words1[i].equals(words2[j])) { wordsInCommon+=
- * wordWt.getWordWeight(words1[i]); } } } return ((double)wordsInCommon) /
- * (Math.log(1+words1.length) + Math.log(1+words2.length)); }
- */
\ No newline at end of file
+}
diff --git a/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java b/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java
index 765bb94..8048e5d 100755
--- a/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java
+++ b/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java
@@ -17,7 +17,6 @@
package opennlp.summarization.textrank;
-
import opennlp.summarization.DocProcessor;
import opennlp.summarization.Score;
import opennlp.summarization.Sentence;
@@ -49,67 +48,60 @@
}
/* Sets up data and calls the TextRank algorithm..*/
- public List<Score> rankSentences(String doc, List<Sentence> sentences, int maxWords) {
- try {
- //Rank sentences
- TextRank summ = new TextRank(docProcessor);
- List<String> sentenceStrL = new ArrayList<>();
- List<String> processedSent = new ArrayList<>();
- Hashtable<String, List<Integer>> iidx = new Hashtable<>();
+ public List<Score> rankSentences(List<Sentence> sentences, int maxWords) {
+ final TextRank summ = new TextRank(docProcessor);
+ final List<String> sentenceStrL = new ArrayList<>();
+ final List<String> processedSent = new ArrayList<>();
+ final Hashtable<String, List<Integer>> iidx = new Hashtable<>();
- for (Sentence s : sentences) {
- sentenceStrL.add(s.getStringVal());
- String stemmedSent = s.stem();
- processedSent.add(stemmedSent);
+ //Rank sentences
+ for (Sentence s : sentences) {
+ sentenceStrL.add(s.getStringVal());
+ String stemmedSent = s.stem();
+ processedSent.add(stemmedSent);
- String[] wrds = stemmedSent.split(" ");
- for (String w : wrds) {
- if (iidx.get(w) != null)
- iidx.get(w).add(s.getSentId());
- else {
- List<Integer> l = new ArrayList<>();
- l.add(s.getSentId());
- iidx.put(w, l);
- }
+ String[] wrds = stemmedSent.split("\\s+");
+ for (String w : wrds) {
+ if (iidx.get(w) != null)
+ iidx.get(w).add(s.getSentId());
+ else {
+ List<Integer> l = new ArrayList<>();
+ l.add(s.getSentId());
+ iidx.put(w, l);
}
}
-
- List<Score> finalScores = summ.getRankedSentences(doc, sentenceStrL, iidx, processedSent);
-
- // SentenceClusterer clust = new SentenceClusterer();
- // clust.runClusterer(doc, summ.processedSent);
-
- Hashtable<Integer, List<Integer>> links = summ.getLinks();
-
- for (int i = 0; i < sentences.size(); i++) {
- Sentence st = sentences.get(i);
-
- //Add links..
- List<Integer> currLnks = links.get(i);
- if (currLnks == null) continue;
- for (int j = 0; j < currLnks.size(); j++) {
- if (j < i) st.addLink(sentences.get(j));
- }
- }
-
- for (Score s : finalScores) {
- Sentence st = sentences.get(s.getSentId());
- st.setPageRankScore(s);
- }
-
- List<Score> reRank = finalScores; //reRank(sentences, finalScores, iidx, wordWt, maxWords);
-
- return reRank;
- } catch (Exception e) {
- e.printStackTrace();
}
- return null;
+
+ List<Score> finalScores = summ.getRankedSentences(sentenceStrL, iidx, processedSent);
+
+ // SentenceClusterer clust = new SentenceClusterer();
+ // clust.runClusterer(doc, summ.processedSent);
+
+ Hashtable<Integer, List<Integer>> links = summ.getLinks();
+
+ for (int i = 0; i < sentences.size(); i++) {
+ Sentence st = sentences.get(i);
+
+ // Add links..
+ List<Integer> currLnks = links.get(i);
+ if (currLnks == null) continue;
+ for (int j = 0; j < currLnks.size(); j++) {
+ if (j < i) st.addLink(sentences.get(j));
+ }
+ }
+
+ for (Score s : finalScores) {
+ Sentence st = sentences.get(s.getSentId());
+ st.setPageRankScore(s);
+ }
+
+ return finalScores; //reRank(sentences, finalScores, iidx, wordWt, maxWords);
}
@Override
public String summarize(String article, int maxWords) {
- List<Sentence> sentences = docProcessor.getSentencesFromStr(article);
- List<Score> scores = rankSentences(article, sentences, maxWords);
+ List<Sentence> sentences = docProcessor.getSentences(article);
+ List<Score> scores = rankSentences(sentences, maxWords);
return scores2String(sentences, scores, maxWords);
}
diff --git a/summarizer/src/test/java/opennlp/summarization/AbstractSummarizerTest.java b/summarizer/src/test/java/opennlp/summarization/AbstractSummarizerTest.java
index ce7bc50..ec31f79 100644
--- a/summarizer/src/test/java/opennlp/summarization/AbstractSummarizerTest.java
+++ b/summarizer/src/test/java/opennlp/summarization/AbstractSummarizerTest.java
@@ -17,7 +17,7 @@
package opennlp.summarization;
-import opennlp.summarization.lexicalchaining.OpenNLPPOSTagger;
+import opennlp.summarization.lexicalchaining.NounPOSTagger;
import opennlp.summarization.preprocess.DefaultDocProcessor;
import org.junit.jupiter.api.BeforeAll;
@@ -37,12 +37,12 @@
private static final Logger log = LoggerFactory.getLogger(AbstractSummarizerTest.class);
protected static DefaultDocProcessor docProcessor;
- protected static OpenNLPPOSTagger posTagger;
+ protected static NounPOSTagger posTagger;
@BeforeAll
static void initEnv() throws IOException {
- docProcessor = new DefaultDocProcessor(AbstractSummarizerTest.class.getResourceAsStream("/en-sent.bin"));
- posTagger = new OpenNLPPOSTagger(docProcessor, AbstractSummarizerTest.class.getResourceAsStream("/en-pos-maxent.bin"));
+ docProcessor = new DefaultDocProcessor("en");
+ posTagger = new NounPOSTagger("en");
}
/**
@@ -52,17 +52,17 @@
@ParameterizedTest(name = "news story {index}")
@ValueSource(strings = {
- "/meta/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story",
- "/meta/0a2278bec4a80aec1bc3e9e7a9dac10ac1b6425b.story",
- "/meta/0a3040b6c1bba95efca727158f128a19c44ec8ba.story",
- "/meta/0a3479b53796863a664c32ca20d8672583335d2a.story",
- "/meta/0a3639cb86487e72e2ba084211f99799918aedf8.story",
- "/meta/0a4092bef1801863296777ebcfeceb1aec23c78f.story",
- "/meta/0a5458d3427b290524a8df11d8503a5b57b32747.story",
- "/meta/0a5691b8fe654b6b2cdace5ab87aff2ee4c23577.story",
- "/meta/0a6790f886a42a76945d4a21ed27c4ebd9ca1025.story"
+ "/news/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story",
+ "/news/0a2278bec4a80aec1bc3e9e7a9dac10ac1b6425b.story",
+ "/news/0a3040b6c1bba95efca727158f128a19c44ec8ba.story",
+ "/news/0a3479b53796863a664c32ca20d8672583335d2a.story",
+ "/news/0a3639cb86487e72e2ba084211f99799918aedf8.story",
+ "/news/0a4092bef1801863296777ebcfeceb1aec23c78f.story",
+ "/news/0a5458d3427b290524a8df11d8503a5b57b32747.story",
+ "/news/0a5691b8fe654b6b2cdace5ab87aff2ee4c23577.story",
+ "/news/0a6790f886a42a76945d4a21ed27c4ebd9ca1025.story"
})
- public void testSummarize(String filename) {
+ public void testSummarize(String filename) throws IOException {
String article = docProcessor.docToString(filename);
String summary = getSummarizer().summarize(article, 20);
assertNotNull(summary);
diff --git a/summarizer/src/test/java/opennlp/summarization/SentenceTest.java b/summarizer/src/test/java/opennlp/summarization/SentenceTest.java
new file mode 100644
index 0000000..8eaed89
--- /dev/null
+++ b/summarizer/src/test/java/opennlp/summarization/SentenceTest.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.summarization;
+
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.NullAndEmptySource;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class SentenceTest {
+
+ private static final String SENTENCE = "This example is available in many tests.";
+
+ // SUT
+ private Sentence sentence;
+
+ @BeforeEach
+ public void setUp() {
+ sentence = new Sentence(0, SENTENCE, 0, 0);
+ }
+
+ @ParameterizedTest
+ @ValueSource(strings = {"\t", "\n", " "})
+ @NullAndEmptySource
+ public void testConstructInvalid1(String input) {
+ assertThrows(IllegalArgumentException.class, () -> new Sentence(0, input, 0, 0));
+ }
+
+ @ParameterizedTest
+ @ValueSource(ints = {Integer.MIN_VALUE, -42, -1})
+ public void testConstructInvalid2(int input) {
+ assertThrows(IllegalArgumentException.class, () -> new Sentence(input, SENTENCE, 0, 0));
+ }
+
+ @ParameterizedTest
+ @ValueSource(ints = {Integer.MIN_VALUE, -42, -1})
+ public void testConstructInvalid3(int input) {
+ assertThrows(IllegalArgumentException.class, () -> new Sentence(0, SENTENCE, input, 0));
+ }
+
+ @ParameterizedTest
+ @ValueSource(ints = {Integer.MIN_VALUE, -42, -1})
+ public void testConstructInvalid4(int input) {
+ assertThrows(IllegalArgumentException.class, () -> new Sentence(0, SENTENCE, 0, input));
+ }
+
+ @Test
+ public void testSentenceIdentity() {
+ assertEquals(0, sentence.getSentId());
+ assertEquals(0, sentence.getParagraph());
+ assertEquals(0, sentence.getParaPos());
+ assertEquals(SENTENCE, sentence.getStringVal());
+ }
+
+ @Test
+ public void testStem() {
+ String stemmed = sentence.stem();
+ assertNotNull(stemmed);
+ assertFalse(stemmed.isBlank());
+ assertEquals("Thi exampl avail mani test ", stemmed);
+ }
+
+ @Test
+ public void testGetWrdCnt() {
+ int wordCountWithoutStopwords = sentence.getWordCount();
+ assertEquals(5, wordCountWithoutStopwords);
+ }
+
+ @Test
+ public void testHashcode() {
+ int hash = sentence.hashCode();
+ assertEquals(hash, new Sentence(0, SENTENCE, 0, 0).hashCode());
+ }
+
+ @Test
+ public void testEquals() {
+ assertEquals(sentence, new Sentence(0, SENTENCE, 0, 0));
+ }
+
+ @Test
+ public void testToString() {
+ assertEquals(sentence.toString(), new Sentence(0, SENTENCE, 0, 0).toString());
+ }
+}
diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/AbstractLexicalChainTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/AbstractLexicalChainTest.java
new file mode 100644
index 0000000..b2bca3c
--- /dev/null
+++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/AbstractLexicalChainTest.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.summarization.lexicalchaining;
+
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+import org.junit.jupiter.api.BeforeAll;
+
+public abstract class AbstractLexicalChainTest {
+
+ protected static final String ARTICLE =
+ "US President Barack Obama has welcomed an agreement between the US and Russia under which Syria's chemical weapons must be destroyed or removed by mid-2014 as an \"important step\"."
+ + "But a White House statement cautioned that the US expected Syria to live up to its public commitments. "
+ + "The US-Russian framework document stipulates that Syria must provide details of its stockpile within a week. "
+ + "If Syria fails to comply, the deal could be enforced by a UN resolution. "
+ + "China, France, the UK, the UN and Nato have all expressed satisfaction at the agreement. "
+ + "In Beijing, Foreign Minister Wang Yi said on Sunday that China welcomes the general agreement between the US and Russia.";
+
+ protected static DefaultDocProcessor dp;
+ protected static LexicalChainingSummarizer lcs;
+
+ @BeforeAll
+ static void initEnv() throws Exception {
+ dp = new DefaultDocProcessor("en");
+ lcs = new LexicalChainingSummarizer(dp, "en");
+ }
+}
diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainTest.java
deleted file mode 100644
index 8655922..0000000
--- a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainTest.java
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.summarization.lexicalchaining;
-
-import opennlp.summarization.Sentence;
-import opennlp.summarization.preprocess.DefaultDocProcessor;
-
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-
-import java.util.Collections;
-import java.util.Hashtable;
-import java.util.List;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-import static org.junit.jupiter.api.Assertions.fail;
-
-class LexChainTest {
-
- private static final String ARTICLE =
- "US President Barack Obama has welcomed an agreement between the US and Russia under which Syria's chemical weapons must be destroyed or removed by mid-2014 as an \"important step\"."
- + "But a White House statement cautioned that the US expected Syria to live up to its public commitments. "
- + "The US-Russian framework document stipulates that Syria must provide details of its stockpile within a week. "
- + "If Syria fails to comply, the deal could be enforced by a UN resolution. "
- + "China, France, the UK, the UN and Nato have all expressed satisfaction at the agreement. "
- + "In Beijing, Foreign Minister Wang Yi said on Sunday that China welcomes the general agreement between the US and Russia.";
-
- private static DefaultDocProcessor dp;
- private static LexicalChainingSummarizer lcs;
-
- @BeforeAll
- static void initEnv() throws Exception {
- dp = new DefaultDocProcessor(LexChainTest.class.getResourceAsStream("/en-sent.bin"));
- lcs = new LexicalChainingSummarizer(dp, LexChainTest.class.getResourceAsStream("/en-pos-maxent.bin"));
- }
-
- @Test
- void testBuildLexicalChains() {
- List<Sentence> sent = dp.getSentencesFromStr(ARTICLE);
- assertNotNull(sent);
- List<LexicalChain> vh = lcs.buildLexicalChains(ARTICLE, sent);
- assertNotNull(vh);
- Collections.sort(vh);
- assertTrue(!vh.isEmpty());
-
- List<Sentence> s = dp.getSentencesFromStr(ARTICLE);
- Hashtable<String, Boolean> comp = new Hashtable<>();
-
- for (int i = vh.size() - 1; i >= Math.max(vh.size() - 50, 0); i--) {
- LexicalChain lc = vh.get(i);
-
- if (!(comp.containsKey(lc.getWord().get(0).getLexicon()))) {
- comp.put(lc.getWord().get(0).getLexicon(), Boolean.TRUE);
- /*
- for(int j=0;j<lc.getWord().size();j++)
- System.out.print(lc.getWord().get(j) + " -- ");
- */
-
- assertEquals(1.0d, lc.score());
- /*
- for(Sentence sid : lc.getSentences()) {
- //if(sid>=0 && sid<s.size())
- System.out.println(sid);
- }
- */
- }
- }
-
- }
-
- @Test
- void testGetRelation() {
- try {
- WordRelationshipDetermination lcs = new WordRelationshipDetermination();
- LexicalChain l = new LexicalChain();
- List<Word> words = lcs.getWordSenses("music");
-
- l.addWord(words.get(0));
- // int rel = lcs.getRelation(l, "nation");
- WordRelation rel2 = lcs.getRelation(l, "tune", true);
- WordRelation rel3 = lcs.getRelation(l, "vocal", true);
- assertEquals(1, rel2.relation());
- assertEquals(1, rel3.relation());
- // assertEquals(rel, LexicalChainingSummarizer.STRONG_RELATION);
- assertEquals(WordRelation.MED_RELATION, rel2.relation());
- assertEquals(WordRelation.MED_RELATION, rel3.relation());
- } catch (Exception e) {
- fail(e.getLocalizedMessage());
- }
- }
-
-}
diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractorTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractorTest.java
index 1bb476a..4b38793 100644
--- a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractorTest.java
+++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractorTest.java
@@ -17,43 +17,69 @@
package opennlp.summarization.lexicalchaining;
+import java.util.Collections;
import java.util.List;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-
import opennlp.summarization.Sentence;
-import opennlp.summarization.preprocess.DefaultDocProcessor;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
-class LexChainingKeywordExtractorTest {
+class LexChainingKeywordExtractorTest extends AbstractLexicalChainTest {
- private static final String ARTICLE =
- "US President Barack Obama has welcomed an agreement between the US and Russia under which Syria's chemical weapons must be destroyed or removed by mid-2014 as an \"important step\"."
- + "But a White House statement cautioned that the US expected Syria to live up to its public commitments. "
- + "The US-Russian framework document stipulates that Syria must provide details of its stockpile within a week. "
- + "If Syria fails to comply, the deal could be enforced by a UN resolution. "
- + "China, France, the UK, the UN and Nato have all expressed satisfaction at the agreement. "
- + "In Beijing, Foreign Minister Wang Yi said on Sunday that China welcomes the general agreement between the US and Russia.";
+ private static List<LexicalChain> chains;
- private static DefaultDocProcessor dp;
- private static LexicalChainingSummarizer lcs;
+ // SUT
+ private LexicalChainingKeywordExtractor keywordExtractor;
@BeforeAll
static void initEnv() throws Exception {
- dp = new DefaultDocProcessor(LexChainingKeywordExtractorTest.class.getResourceAsStream("/en-sent.bin"));
- lcs = new LexicalChainingSummarizer(dp, LexChainingKeywordExtractorTest.class.getResourceAsStream("/en-pos-maxent.bin"));
+ AbstractLexicalChainTest.initEnv();
+ // Prep
+ List<Sentence> sent = dp.getSentences(ARTICLE);
+ assertNotNull(sent);
+ assertFalse(sent.isEmpty());
+ chains = lcs.buildLexicalChains(sent);
+ assertNotNull(chains);
+ assertFalse(chains.isEmpty());
+ }
+
+ @BeforeEach
+ public void setUp() {
+ keywordExtractor = new LexicalChainingKeywordExtractor();
+ }
+
+ @ParameterizedTest
+ @ValueSource(ints = {1, 5, 42, Integer.MAX_VALUE})
+ void testExtractKeywords(int noOfKeywords) {
+ List<String> keywords = keywordExtractor.extractKeywords(chains, noOfKeywords);
+ assertNotNull(keywords);
+ assertFalse(keywords.isEmpty());
}
@Test
- void testGetKeywords() {
- List<Sentence> sent = dp.getSentencesFromStr(ARTICLE);
- List<LexicalChain> vh = lcs.buildLexicalChains(ARTICLE, sent);
- LexChainingKeywordExtractor ke = new LexChainingKeywordExtractor();
- List<String> keywords = ke.getKeywords(vh, 5);
+ void testExtractKeywordsWithEmptyInput() {
+ List<String> keywords = keywordExtractor.extractKeywords(Collections.emptyList(), 5);
assertNotNull(keywords);
- assertFalse(keywords.isEmpty());
+ assertTrue(keywords.isEmpty());
+ }
+
+ @Test
+ void testExtractKeywordsInvalid1() {
+ assertThrows(IllegalArgumentException.class, () -> keywordExtractor.extractKeywords(null, 5));
+ }
+
+ @ParameterizedTest
+ @ValueSource(ints = {Integer.MIN_VALUE, -1, 0})
+ void testExtractKeywordsInvalid2(int noOfKeywords) {
+ assertThrows(IllegalArgumentException.class, () -> keywordExtractor.extractKeywords(chains, noOfKeywords));
}
}
diff --git a/summarizer/src/test/java/opennlp/summarization/meta/MetaSummarizerTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerNewsTest.java
similarity index 63%
copy from summarizer/src/test/java/opennlp/summarization/meta/MetaSummarizerTest.java
copy to summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerNewsTest.java
index 2a80782..8826802 100644
--- a/summarizer/src/test/java/opennlp/summarization/meta/MetaSummarizerTest.java
+++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerNewsTest.java
@@ -15,29 +15,32 @@
* limitations under the License.
*/
-package opennlp.summarization.meta;
+package opennlp.summarization.lexicalchaining;
import opennlp.summarization.AbstractSummarizerTest;
import opennlp.summarization.Summarizer;
import org.junit.jupiter.api.BeforeEach;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
/**
- * Tests the implementation of {@link MetaSummarizer}.
+ * Tests the implementation of {@link LexicalChainingSummarizer} via a small sample of news texts.
*/
-public class MetaSummarizerTest extends AbstractSummarizerTest {
+public class LexicalChainingSummarizerNewsTest extends AbstractSummarizerTest {
// SUT
- private Summarizer metaSummarizer;
+ private LexicalChainingSummarizer lexicalChainSummarizer;
@BeforeEach
void setUp() {
- metaSummarizer = new MetaSummarizer(docProcessor, posTagger);
+ lexicalChainSummarizer = new LexicalChainingSummarizer(docProcessor, posTagger);
}
@Override
public Summarizer getSummarizer() {
- return metaSummarizer;
+ return lexicalChainSummarizer;
}
-
}
diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerTest.java
index 5d23bef..1f43f44 100644
--- a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerTest.java
+++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerTest.java
@@ -17,27 +17,58 @@
package opennlp.summarization.lexicalchaining;
-import opennlp.summarization.AbstractSummarizerTest;
-import opennlp.summarization.Summarizer;
+import java.util.Collections;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.Map;
import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
-/**
- * Tests the implementation of {@link LexicalChainingSummarizer}.
- */
-public class LexicalChainingSummarizerTest extends AbstractSummarizerTest {
+import opennlp.summarization.Sentence;
- // SUT
- private Summarizer lexicalChainSummarizer;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+class LexicalChainingSummarizerTest extends AbstractLexicalChainTest {
+
+ private List<Sentence> sent;
@BeforeEach
void setUp() {
- lexicalChainSummarizer = new LexicalChainingSummarizer(docProcessor, posTagger);
+ sent = dp.getSentences(ARTICLE);
+ assertNotNull(sent);
}
- @Override
- public Summarizer getSummarizer() {
- return lexicalChainSummarizer;
+ @Test
+ void testBuildLexicalChains() {
+ List<LexicalChain> vh = lcs.buildLexicalChains(sent);
+ assertNotNull(vh);
+ Collections.sort(vh);
+ assertFalse(vh.isEmpty());
+
+ Map<String, Boolean> comp = new Hashtable<>();
+
+ for (int i = vh.size() - 1; i >= Math.max(vh.size() - 50, 0); i--) {
+ LexicalChain lc = vh.get(i);
+ Word w = lc.getWords().get(0);
+ if (!(comp.containsKey(w.getLexicon()))) {
+ comp.put(w.getLexicon(), Boolean.TRUE);
+ /*
+ for(int j=0;j<lc.getWord().size();j++)
+ System.out.print(lc.getWord().get(j) + " -- ");
+ */
+
+ // assertEquals(1.0d, lc.score());
+ /*
+ System.out.println(lc + ": ");
+ for(Sentence sid : lc.getSentences()) {
+ //if(sid>=0 && sid<s.size())
+ System.out.println("\t" + sid + " [" + lc.score() + "]");
+ }
+ */
+ }
+ }
}
}
diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/NounPOSTaggerTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/NounPOSTaggerTest.java
new file mode 100644
index 0000000..246b37d
--- /dev/null
+++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/NounPOSTaggerTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.summarization.lexicalchaining;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.EmptySource;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import java.io.IOException;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+/**
+ * Tests the {@link POSTagger} implementation {@link NounPOSTagger}.
+ */
+public class NounPOSTaggerTest {
+
+ private static final String UNTAGGED_SENTENCE = "This is a test .";
+ private static final String[] TOKENS_SENTENCE = {"This", "is", "a", "test", "."};
+ private static final String[] TOKENS_TAGGED_SENTENCE = {"This/PRON", "is/AUX", "a/DET", "test/NOUN", "./PUNCT"};
+
+ private static POSTagger tagger; // SUT
+
+ @BeforeAll
+ public static void initResources() throws IOException {
+ tagger = new NounPOSTagger("en");
+ }
+
+ @Test
+ void testConstructWithInvalidResource() {
+ assertThrows(IllegalArgumentException.class, () -> new NounPOSTagger(null));
+ }
+
+ @Test
+ void testGetTaggedString() {
+ String tagged = tagger.getTaggedString(UNTAGGED_SENTENCE);
+ assertNotNull(tagged);
+ assertEquals("This/PRON is/AUX a/DET test/NOUN ./PUNCT", tagged);
+ }
+
+ @Test
+ void testGetTaggedStringInvalid1() {
+ assertThrows(IllegalArgumentException.class, () -> tagger.getTaggedString(null));
+ }
+
+ @ParameterizedTest
+ @ValueSource(strings = {"\t", "\n", " "})
+ @EmptySource
+ void testGetTaggedStringInvalid2(String input) {
+ String tagged = tagger.getTaggedString(input);
+ assertNotNull(tagged);
+ }
+
+ @Test
+ void testGetWordsOfTypeWithTags() {
+ List<String> filteredByType = tagger.getWordsOfType(TOKENS_TAGGED_SENTENCE, POSTagger.NOUN);
+ assertNotNull(filteredByType);
+ assertEquals(1, filteredByType.size());
+ assertEquals("test", filteredByType.get(0));
+ }
+
+ @Test
+ void testGetWordsOfTypeWithoutTags() {
+ assertThrows(IllegalArgumentException.class, () ->
+ tagger.getWordsOfType(TOKENS_SENTENCE, POSTagger.NOUN));
+ }
+
+ @ParameterizedTest
+ @ValueSource(ints = {POSTagger.ADJECTIVE, POSTagger.ADVERB, POSTagger.VERB})
+ void testGetWordsOfTypeWithNonMatchingType(int type) {
+ List<String> filteredByType = tagger.getWordsOfType(TOKENS_TAGGED_SENTENCE, type);
+ assertNotNull(filteredByType);
+ assertEquals(0, filteredByType.size());
+ }
+
+ @ParameterizedTest
+ @ValueSource(ints = {Integer.MIN_VALUE, -1, 5, Integer.MAX_VALUE})
+ void testGetWordsOfTypeWithInvalidType(int type) {
+ assertThrows(IllegalArgumentException.class, () ->
+ tagger.getWordsOfType(TOKENS_TAGGED_SENTENCE, type));
+ }
+
+ @Test
+ void testGetWordsOfTypeWithInvalidInput() {
+ assertThrows(IllegalArgumentException.class, () ->
+ tagger.getWordsOfType(null, POSTagger.NOUN));
+ }
+}
diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/WordRelationshipDeterminationTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/WordRelationshipDeterminationTest.java
new file mode 100644
index 0000000..bd8845f
--- /dev/null
+++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/WordRelationshipDeterminationTest.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.summarization.lexicalchaining;
+
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+public class WordRelationshipDeterminationTest {
+
+ // SUT
+ private WordRelationshipDetermination wrd;
+
+ @BeforeEach
+ public void setUp() {
+ wrd = new WordRelationshipDetermination();
+ }
+
+ @Test
+ void testGetWordSenses() {
+ LexicalChain l = new LexicalChain();
+ List<Word> words = wrd.getWordSenses("music");
+ assertNotNull(words);
+ assertFalse(words.isEmpty());
+ l.addWord(words.get(0));
+ }
+
+ @Test
+ void testGetRelation() {
+ LexicalChain l = new LexicalChain();
+ List<Word> words = wrd.getWordSenses("music");
+ assertNotNull(words);
+ assertFalse(words.isEmpty());
+ l.addWord(words.get(0));
+ // int rel = lcs.getRelation(l, "nation");
+ WordRelation rel2 = wrd.getRelation(l, "tune", true);
+ WordRelation rel3 = wrd.getRelation(l, "vocal", true);
+ assertEquals(1, rel2.relation());
+ assertEquals(1, rel3.relation());
+ // assertEquals(rel, LexicalChainingSummarizer.STRONG_RELATION);
+ assertEquals(WordRelation.MED_RELATION, rel2.relation());
+ assertEquals(WordRelation.MED_RELATION, rel3.relation());
+ }
+}
diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/WordnetWordTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/WordnetWordTest.java
new file mode 100644
index 0000000..ab25c21
--- /dev/null
+++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/WordnetWordTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.summarization.lexicalchaining;
+
+import edu.mit.jwi.item.ISynsetID;
+import edu.mit.jwi.item.IWordID;
+import edu.mit.jwi.item.POS;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.NullAndEmptySource;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+/**
+ * Tests the implementation of {@link WordnetWord}.
+ */
+public class WordnetWordTest {
+
+  private WordRelationshipDetermination wrd;
+
+  // SUT
+  private Word word;
+
+  @BeforeEach
+  public void setUp() {
+    wrd = new WordRelationshipDetermination();
+    // Obtain a real WordnetWord instance via the first sense of "music".
+    List<Word> words = wrd.getWordSenses("music");
+    assertNotNull(words);
+    assertFalse(words.isEmpty());
+    word = words.get(0);
+    assertNotNull(word);
+  }
+
+  /**
+   * A {@code null}, empty, or blank lexicon must be rejected.
+   */
+  @ParameterizedTest
+  @ValueSource(strings = {"\t", "\n", " "})
+  @NullAndEmptySource
+  public void testConstructInvalid1(String input) {
+    assertThrows(IllegalArgumentException.class, () -> new WordnetWord(input, new DummyWordID()));
+  }
+
+  /**
+   * A {@code null} word ID must be rejected.
+   */
+  @Test
+  public void testConstructInvalid2() {
+    assertThrows(IllegalArgumentException.class, () -> new WordnetWord("music", null));
+  }
+
+  /**
+   * Verifies lexicon and word ID of the sense retrieved in {@link #setUp()}.
+   */
+  @Test
+  public void testWordIdentity() {
+    assertEquals("music", word.getLexicon());
+    assertEquals("WID-07034009-N-01-music", word.getID().toString());
+  }
+
+  /**
+   * Equal words (same sense of "music") must have equal hash codes.
+   */
+  @Test
+  public void testHashcode() {
+    int hash = word.hashCode();
+    assertEquals(hash, wrd.getWordSenses("music").get(0).hashCode());
+  }
+
+  @Test
+  public void testEquals() {
+    assertEquals(word, wrd.getWordSenses("music").get(0));
+  }
+
+  @Test
+  public void testToString() {
+    assertEquals(word.toString(), wrd.getWordSenses("music").get(0).toString());
+  }
+
+  /**
+   * Minimal {@link IWordID} stub used to drive the invalid-lexicon constructor tests.
+   */
+  private static class DummyWordID implements IWordID {
+    @Override
+    public ISynsetID getSynsetID() {
+      return null;
+    }
+
+    @Override
+    public int getWordNumber() {
+      return 0;
+    }
+
+    @Override
+    public String getLemma() {
+      return "";
+    }
+
+    @Override
+    public POS getPOS() {
+      return null;
+    }
+  }
+}
diff --git a/summarizer/src/test/java/opennlp/summarization/meta/MetaSummarizerTest.java b/summarizer/src/test/java/opennlp/summarization/meta/MetaSummarizerNewsTest.java
similarity index 88%
rename from summarizer/src/test/java/opennlp/summarization/meta/MetaSummarizerTest.java
rename to summarizer/src/test/java/opennlp/summarization/meta/MetaSummarizerNewsTest.java
index 2a80782..fd6852d 100644
--- a/summarizer/src/test/java/opennlp/summarization/meta/MetaSummarizerTest.java
+++ b/summarizer/src/test/java/opennlp/summarization/meta/MetaSummarizerNewsTest.java
@@ -23,9 +23,9 @@
import org.junit.jupiter.api.BeforeEach;
/**
- * Tests the implementation of {@link MetaSummarizer}.
+ * Tests the implementation of {@link MetaSummarizer} via a small news texts sample.
*/
-public class MetaSummarizerTest extends AbstractSummarizerTest {
+public class MetaSummarizerNewsTest extends AbstractSummarizerTest {
// SUT
private Summarizer metaSummarizer;
diff --git a/summarizer/src/test/java/opennlp/summarization/preprocess/DefaultDocProcessorTest.java b/summarizer/src/test/java/opennlp/summarization/preprocess/DefaultDocProcessorTest.java
new file mode 100644
index 0000000..6814854
--- /dev/null
+++ b/summarizer/src/test/java/opennlp/summarization/preprocess/DefaultDocProcessorTest.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.summarization.preprocess;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import opennlp.summarization.Sentence;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.NullAndEmptySource;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests the implementation of {@link DefaultDocProcessor}.
+ */
+class DefaultDocProcessorTest {
+
+  // Shared sample with tricky punctuation; expected to split into three sentences.
+  private static final String SAMPLE =
+      "This is a sentence, with some punctuations; to test if the sentence breaker can handle it! Is every thing working OK ? Yes.";
+
+  // SUT
+  private static DefaultDocProcessor dp;
+
+  @BeforeAll
+  static void initEnv() throws IOException {
+    dp = new DefaultDocProcessor("en");
+  }
+
+  @Test
+  void testGetSentences() {
+    List<Sentence> doc = dp.getSentences(SAMPLE);
+    assertNotNull(doc);
+    assertEquals(3, doc.size());
+  }
+
+  /**
+   * {@code null}, empty, or blank input must yield an empty (never {@code null}) list.
+   */
+  @ParameterizedTest
+  @ValueSource(strings = {"\t", "\n", " "})
+  @NullAndEmptySource
+  void testGetSentencesInvalid(String input) {
+    List<Sentence> doc = dp.getSentences(input);
+    assertNotNull(doc);
+    assertEquals(0, doc.size());
+  }
+
+  @Test
+  void testGetWords() {
+    List<Sentence> doc = dp.getSentences(SAMPLE);
+    assertNotNull(doc);
+    assertEquals(3, doc.size());
+    for (Sentence sentence : doc) {
+      String[] words = dp.getWords(sentence.getStringVal());
+      assertNotNull(words);
+      assertTrue(words.length > 0);
+      assertTrue(words.length >= sentence.getWordCount()); // due to stop words not counted, this must hold.
+    }
+  }
+
+  /**
+   * {@code null}, empty, or blank input must yield a zero-length (never {@code null}) array.
+   */
+  @ParameterizedTest
+  @ValueSource(strings = {"\t", "\n", " "})
+  @NullAndEmptySource
+  void testGetWordsInvalid(String input) {
+    String[] words = dp.getWords(input);
+    assertNotNull(words);
+    assertEquals(0, words.length);
+  }
+
+  @Test
+  void testDocToString() throws IOException {
+    String content = dp.docToString("/news/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story");
+    assertNotNull(content);
+    assertFalse(content.isEmpty());
+  }
+
+  /**
+   * {@code null}, empty, or blank paths must yield an empty (never {@code null}) string.
+   */
+  @ParameterizedTest
+  @ValueSource(strings = {"\t", "\n", " "})
+  @NullAndEmptySource
+  void testDocToStringInvalid(String input) throws IOException {
+    String content = dp.docToString(input);
+    assertNotNull(content);
+    assertTrue(content.isEmpty());
+  }
+
+  @Test
+  void testDocToSentences() throws IOException {
+    List<Sentence> content = dp.docToSentences("/news/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story");
+    assertNotNull(content);
+    assertFalse(content.isEmpty());
+  }
+
+  /**
+   * {@code null}, empty, or blank paths must yield an empty (never {@code null}) list.
+   */
+  @ParameterizedTest
+  @ValueSource(strings = {"\t", "\n", " "})
+  @NullAndEmptySource
+  void testDocToSentencesInvalid(String input) throws IOException {
+    List<Sentence> content = dp.docToSentences(input);
+    assertNotNull(content);
+    assertTrue(content.isEmpty());
+  }
+}
diff --git a/summarizer/src/test/java/opennlp/summarization/preprocess/DocProcessorTest.java b/summarizer/src/test/java/opennlp/summarization/preprocess/DocProcessorTest.java
deleted file mode 100644
index ce31c26..0000000
--- a/summarizer/src/test/java/opennlp/summarization/preprocess/DocProcessorTest.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.summarization.preprocess;
-
-import java.util.List;
-
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-
-import opennlp.summarization.Sentence;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-class DocProcessorTest {
-
- private static DefaultDocProcessor dp;
-
- @BeforeAll
- static void initEnv() {
- dp = new DefaultDocProcessor(DocProcessorTest.class.getResourceAsStream("/en-sent.bin"));
- }
-
- @Test
- void testGetSentencesFromStr() {
- String sent = "This is a sentence, with some punctuations; to test if the sentence breaker can handle it! Is every thing working OK ? Yes.";
- List<Sentence> doc = dp.getSentencesFromStr(sent);
- //dp.docToString(fileName);
- assertEquals(doc.size(), 3);
- }
-
-}
diff --git a/summarizer/src/test/java/opennlp/summarization/textrank/TextRankSummarizerTest.java b/summarizer/src/test/java/opennlp/summarization/textrank/TextRankSummarizerNewsTest.java
similarity index 88%
rename from summarizer/src/test/java/opennlp/summarization/textrank/TextRankSummarizerTest.java
rename to summarizer/src/test/java/opennlp/summarization/textrank/TextRankSummarizerNewsTest.java
index 31f89f6..39c3f19 100644
--- a/summarizer/src/test/java/opennlp/summarization/textrank/TextRankSummarizerTest.java
+++ b/summarizer/src/test/java/opennlp/summarization/textrank/TextRankSummarizerNewsTest.java
@@ -23,9 +23,9 @@
import org.junit.jupiter.api.BeforeEach;
/**
- * Tests the implementation of {@link TextRankSummarizer}.
+ * Tests the implementation of {@link TextRankSummarizer} via a small news texts sample.
*/
-public class TextRankSummarizerTest extends AbstractSummarizerTest {
+public class TextRankSummarizerNewsTest extends AbstractSummarizerTest {
// SUT
private Summarizer textRankSummarizer;
diff --git a/summarizer/src/test/resources/meta/Notes.txt b/summarizer/src/test/resources/Notes.txt
similarity index 100%
rename from summarizer/src/test/resources/meta/Notes.txt
rename to summarizer/src/test/resources/Notes.txt
diff --git a/summarizer/src/test/resources/en-pos-maxent.bin b/summarizer/src/test/resources/en-pos-maxent.bin
deleted file mode 100644
index 168f259..0000000
--- a/summarizer/src/test/resources/en-pos-maxent.bin
+++ /dev/null
Binary files differ
diff --git a/summarizer/src/test/resources/en-sent.bin b/summarizer/src/test/resources/en-sent.bin
deleted file mode 100644
index d3a2779..0000000
--- a/summarizer/src/test/resources/en-sent.bin
+++ /dev/null
Binary files differ
diff --git a/summarizer/src/test/resources/meta/idf.csv b/summarizer/src/test/resources/idf.csv
similarity index 100%
rename from summarizer/src/test/resources/meta/idf.csv
rename to summarizer/src/test/resources/idf.csv
diff --git a/summarizer/src/test/resources/meta/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story b/summarizer/src/test/resources/news/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story
similarity index 100%
rename from summarizer/src/test/resources/meta/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story
rename to summarizer/src/test/resources/news/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story
diff --git a/summarizer/src/test/resources/meta/0a2278bec4a80aec1bc3e9e7a9dac10ac1b6425b.story b/summarizer/src/test/resources/news/0a2278bec4a80aec1bc3e9e7a9dac10ac1b6425b.story
similarity index 100%
rename from summarizer/src/test/resources/meta/0a2278bec4a80aec1bc3e9e7a9dac10ac1b6425b.story
rename to summarizer/src/test/resources/news/0a2278bec4a80aec1bc3e9e7a9dac10ac1b6425b.story
diff --git a/summarizer/src/test/resources/meta/0a3040b6c1bba95efca727158f128a19c44ec8ba.story b/summarizer/src/test/resources/news/0a3040b6c1bba95efca727158f128a19c44ec8ba.story
similarity index 100%
rename from summarizer/src/test/resources/meta/0a3040b6c1bba95efca727158f128a19c44ec8ba.story
rename to summarizer/src/test/resources/news/0a3040b6c1bba95efca727158f128a19c44ec8ba.story
diff --git a/summarizer/src/test/resources/meta/0a3479b53796863a664c32ca20d8672583335d2a.story b/summarizer/src/test/resources/news/0a3479b53796863a664c32ca20d8672583335d2a.story
similarity index 100%
rename from summarizer/src/test/resources/meta/0a3479b53796863a664c32ca20d8672583335d2a.story
rename to summarizer/src/test/resources/news/0a3479b53796863a664c32ca20d8672583335d2a.story
diff --git a/summarizer/src/test/resources/meta/0a3639cb86487e72e2ba084211f99799918aedf8.story b/summarizer/src/test/resources/news/0a3639cb86487e72e2ba084211f99799918aedf8.story
similarity index 100%
rename from summarizer/src/test/resources/meta/0a3639cb86487e72e2ba084211f99799918aedf8.story
rename to summarizer/src/test/resources/news/0a3639cb86487e72e2ba084211f99799918aedf8.story
diff --git a/summarizer/src/test/resources/meta/0a4092bef1801863296777ebcfeceb1aec23c78f.story b/summarizer/src/test/resources/news/0a4092bef1801863296777ebcfeceb1aec23c78f.story
similarity index 100%
rename from summarizer/src/test/resources/meta/0a4092bef1801863296777ebcfeceb1aec23c78f.story
rename to summarizer/src/test/resources/news/0a4092bef1801863296777ebcfeceb1aec23c78f.story
diff --git a/summarizer/src/test/resources/meta/0a4324d4a5effa420aa95bb058314eab35c73852.story b/summarizer/src/test/resources/news/0a4324d4a5effa420aa95bb058314eab35c73852.story
similarity index 100%
rename from summarizer/src/test/resources/meta/0a4324d4a5effa420aa95bb058314eab35c73852.story
rename to summarizer/src/test/resources/news/0a4324d4a5effa420aa95bb058314eab35c73852.story
diff --git a/summarizer/src/test/resources/meta/0a5458d3427b290524a8df11d8503a5b57b32747.story b/summarizer/src/test/resources/news/0a5458d3427b290524a8df11d8503a5b57b32747.story
similarity index 100%
rename from summarizer/src/test/resources/meta/0a5458d3427b290524a8df11d8503a5b57b32747.story
rename to summarizer/src/test/resources/news/0a5458d3427b290524a8df11d8503a5b57b32747.story
diff --git a/summarizer/src/test/resources/meta/0a5691b8fe654b6b2cdace5ab87aff2ee4c23577.story b/summarizer/src/test/resources/news/0a5691b8fe654b6b2cdace5ab87aff2ee4c23577.story
similarity index 100%
rename from summarizer/src/test/resources/meta/0a5691b8fe654b6b2cdace5ab87aff2ee4c23577.story
rename to summarizer/src/test/resources/news/0a5691b8fe654b6b2cdace5ab87aff2ee4c23577.story
diff --git a/summarizer/src/test/resources/meta/0a6790f886a42a76945d4a21ed27c4ebd9ca1025.story b/summarizer/src/test/resources/news/0a6790f886a42a76945d4a21ed27c4ebd9ca1025.story
similarity index 100%
rename from summarizer/src/test/resources/meta/0a6790f886a42a76945d4a21ed27c4ebd9ca1025.story
rename to summarizer/src/test/resources/news/0a6790f886a42a76945d4a21ed27c4ebd9ca1025.story