blob: 6ad1d224016d0566a0d6f2ebdb974baac5caaf9d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.summarization;
import java.text.BreakIterator;
import java.util.List;
import java.util.ArrayList;
import java.util.Locale;
import opennlp.summarization.preprocess.PorterStemmer;
import opennlp.summarization.preprocess.StopWords;
/**
 * A representation of a sentence geared toward pagerank and summarization.
 *
 * <p>Besides the raw text, a sentence carries its position in the document
 * (sentence id, paragraph, position inside the paragraph), a page-rank score,
 * a word weight, a count of non-stop-word tokens and the list of sentences it
 * links to.
 *
 * <p>Two sentences are considered equal when their sentence ids match; no
 * article id is tracked yet.
 */
public class Sentence {

  /** Separator appended after every stemmed token produced by {@link #stem()}. */
  static final String space = " ";

  // sentId is always the position of the sentence in the document.
  private int sentId;
  private String stringVal;
  private Score pageRankScore;
  private int paragraph;
  private int paraPos;
  private boolean hasQuote;
  private double wordWt = 0;
  private int wordCnt;
  private List<Sentence> links;

  /** Creates a sentence with no text and an empty list of links. */
  public Sentence() {
    links = new ArrayList<Sentence>();
  }

  /**
   * Creates a sentence with no text and the given id.
   *
   * @param id the position of the sentence in the document
   */
  public Sentence(int id) {
    this();
    this.sentId = id;
  }

  public void setSentId(int sentId) {
    this.sentId = sentId;
  }

  public int getSentId() {
    return sentId;
  }

  public void setPageRankScore(Score pageRankScore) {
    this.pageRankScore = pageRankScore;
  }

  public Score getPageRankScore() {
    return pageRankScore;
  }

  public void setParagraph(int paragraph) {
    this.paragraph = paragraph;
  }

  public int getParagraph() {
    return paragraph;
  }

  public void setParaPos(int paraPos) {
    this.paraPos = paraPos;
  }

  public int getParaPos() {
    return paraPos;
  }

  /**
   * Sets the text of this sentence and refreshes the state derived from it:
   * the quote flag and the cached non-stop-word count.
   *
   * @param stringVal the sentence text; {@code null} is treated as "no text"
   */
  public void setStringVal(String stringVal) {
    this.stringVal = stringVal;
    // Recompute the flag instead of only ever raising it, so that replacing
    // the text with one that has no quote clears a stale value.
    this.hasQuote = stringVal != null && stringVal.contains("\"");
    this.wordCnt = stringVal == null ? 0 : calcWrdCnt(stringVal);
  }

  /**
   * Counts the tokens of {@code text} (split on a single space) that are not
   * stop words, do not start with an apostrophe and are not a lone
   * {@code "."} or {@code "?"}.
   *
   * @param text the text to count words in; must not be {@code null}
   * @return the number of counted tokens
   */
  private int calcWrdCnt(String text) {
    int ret = 0;
    StopWords sw = StopWords.getInstance();
    // Operate on the argument, not on the field, so the result does not
    // depend on the order of assignments in the caller.
    for (String wrd : text.split(" ")) {
      if (!sw.isStopWord(wrd) && !wrd.startsWith("'") && !wrd.equals(".") && !wrd.equals("?")) {
        ret++;
      }
    }
    return ret;
  }

  public String getStringVal() {
    return stringVal;
  }

  /**
   * Adds a sentence to the list of sentences this one links to.
   *
   * @param s the linked sentence
   */
  public void addLink(Sentence s) {
    this.links.add(s);
  }

  /**
   * @return the live (not copied) list of linked sentences
   */
  public List<Sentence> getLinks() {
    return this.links;
  }

  /**
   * @return the raw sentence text, or {@code null} if none has been set
   */
  @Override
  public String toString() {
    return this.stringVal;
  }

  public void setWordWt(double wordWt) {
    this.wordWt = wordWt;
  }

  public double getWordWt() {
    return wordWt;
  }

  /**
   * Returns the number of words in this sentence.
   *
   * @return the cached non-stop-word count; if that count is zero, the raw
   *     number of space-separated tokens; {@code 0} when no text has been set
   */
  public int getWordCnt() {
    if (wordCnt != 0) {
      return wordCnt;
    }
    return stringVal == null ? 0 : stringVal.split(" ").length;
  }

  /**
   * Compares sentences by id only.
   *
   * <p>Should add an article id to the sentence class. For now returns true
   * if the ids are the same.
   */
  @Override
  public boolean equals(Object o) {
    if (!(o instanceof Sentence)) {
      return false;
    }
    return ((Sentence) o).sentId == this.sentId;
  }

  /**
   * Hash code consistent with {@link #equals(Object)}: derived from the
   * sentence id only, so equal sentences land in the same hash bucket.
   */
  @Override
  public int hashCode() {
    return sentId;
  }

  /**
   * Builds a stemmed form of the sentence text.
   *
   * <p>The text is split with a {@link BreakIterator} word instance for
   * {@link Locale#US}. Double and single quote characters are removed from
   * each token; tokens that are empty afterwards or that are stop words are
   * skipped. Every remaining token is run through a {@link PorterStemmer} and
   * appended to the result followed by a single space. Tokens the break
   * iterator reports for whitespace and punctuation are not filtered
   * separately and go through the same path.
   *
   * @return the stemmed tokens, each followed by a space; an empty string
   *     when no text has been set
   */
  public String stem() {
    if (stringVal == null) {
      return "";
    }

    PorterStemmer stemmer = new PorterStemmer();
    StopWords sw = StopWords.getInstance();
    BreakIterator wrdItr = BreakIterator.getWordInstance(Locale.US);
    wrdItr.setText(stringVal);

    StringBuilder b = new StringBuilder();
    int wrdStrt = wrdItr.first();
    for (int wrdEnd = wrdItr.next(); wrdEnd != BreakIterator.DONE;
        wrdStrt = wrdEnd, wrdEnd = wrdItr.next()) {
      // Strings are immutable: the stripped value must be kept, otherwise the
      // quotes survive into the stop-word check and the stemmer.
      String word = stringVal.substring(wrdStrt, wrdEnd).replace("\"", "").replace("'", "");

      // Nothing left to stem once the quotes are gone.
      if (word.isEmpty()) {
        continue;
      }
      // Skip stop words and stem the word.
      if (sw.isStopWord(word)) {
        continue;
      }

      // The stemmed form is read back through toString(), as before.
      stemmer.stem(word);
      b.append(stemmer.toString());
      b.append(space);
    }
    return b.toString();
  }
}