/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.summarization.preprocess;

import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStream;
import java.io.LineNumberReader;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

import opennlp.summarization.DocProcessor;
import opennlp.summarization.Sentence;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.Stemmer;

/*
 * Parses a document into sentences.
 */
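/*
 * Example usage (a minimal sketch; "en-sent.bin" and "article.txt" are placeholder
 * file names, not provided by this class):
 *
 *   DefaultDocProcessor dp = new DefaultDocProcessor("en-sent.bin");
 *   String text = dp.docToString("article.txt");
 *   List<Sentence> sentences = dp.getSentencesFromStr(text);
 */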
public class DefaultDocProcessor implements DocProcessor {

  SentenceModel sentModel;
  Stemmer stemmer;
  StopWords sw;

  // Which sentence fragmentation strategy to use.
  static int OPEN_NLP = 1;
  static int SIMPLE = 2;
  static int SENTENCE_FRAG = OPEN_NLP;

  public DefaultDocProcessor(String fragModelFile) {
    stemmer = new PorterStemmer();
    try (InputStream modelIn = new FileInputStream(fragModelFile)) {
      sentModel = new SentenceModel(modelIn);
    } catch (Exception ex) {
      Logger.getAnonymousLogger().info(
          "Could not load the sentence model; falling back to the simple sentence splitter: " + ex.getMessage());
    }
  }

  // str - document or paragraph to split
  // sentences - output list holding the raw sentences found
  // iidx - if not null, updated as an inverted index mapping each stemmed word to the ids of the sentences containing it
  // processedSent - output list of sentences after stemming and stop-word removal
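  // For example, after processing "The cat sat. The cat ran." the index might map the
  // stemmed form of "cat" to sentence ids [0, 1] and "sat"/"ran" to [0] and [1]
  // respectively (assuming "the" is in the stop-word list; exact keys depend on the
  // stemmer and stop-word list in use, so this is illustrative only).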
  private void getSentences(String str, List<String> sentences,
      Hashtable<String, List<Integer>> iidx, List<String> processedSent) {
    Stemmer stemmer = new PorterStemmer();
    StopWords sw = StopWords.getInstance();
    BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);
    BreakIterator wrdItr = BreakIterator.getWordInstance(Locale.US);
    iterator.setText(str);
    int sentCnt = 0;
    int start = iterator.first();
    for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
      String sentence = str.substring(start, end);
      // Add the sentence as-is; do all processing at the word level.
      sentences.add(sentence);
      wrdItr.setText(sentence);
      StringBuilder procSent = new StringBuilder();
      int wrdStrt = 0;
      for (int wrdEnd = wrdItr.next(); wrdEnd != BreakIterator.DONE;
          wrdStrt = wrdEnd, wrdEnd = wrdItr.next()) {
        // Strip quote characters, skip stop words and stem the rest.
        String word = sentence.substring(wrdStrt, wrdEnd).replaceAll("\"|'", "");
        if (sw.isStopWord(word)) continue;
        String stemmedWord = stemmer.stem(word).toString();
        // Update the inverted index by adding the current sentence id to this word's list.
        if (iidx != null && stemmedWord.length() > 1) {
          List<Integer> sentList = iidx.get(stemmedWord);
          if (sentList == null) {
            sentList = new ArrayList<Integer>();
          }
          sentList.add(sentCnt);
          // Save it back.
          iidx.put(stemmedWord, sentList);
        }
        procSent.append(stemmedWord).append(" ");
      }
      sentCnt++;
      if (processedSent != null) {
        processedSent.add(procSent.toString());
      }
    }
  }

  // Reads a document file into a single string, stripping character entities.
  public String docToString(String fileName) {
    StringBuilder docBuffer = new StringBuilder();
    try (LineNumberReader lnr = new LineNumberReader(new FileReader(fileName))) {
      String nextLine;
      while ((nextLine = lnr.readLine()) != null) {
        String trimmedLine = nextLine.trim();
        if (!trimmedLine.isEmpty()) {
          // Drop HTML/XML character entities such as &amp; or &#39; before appending.
          docBuffer.append(trimmedLine.replaceAll("&#?[0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z]?;", "")).append(" ");
        }
      }
    } catch (Exception ex) {
      Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex);
    }
    return docBuffer.toString();
  }

  // Builds the list of sentences for a document; each non-empty line is treated as a paragraph.
  public List<Sentence> docToSentList(String fileName) {
    List<Sentence> sentList = new ArrayList<Sentence>();
    try (LineNumberReader lnr = new LineNumberReader(new FileReader(fileName))) {
      String nextLine;
      int paraNo = 0;
      int sentNo = 0;
      while ((nextLine = lnr.readLine()) != null) {
        String trimmedLine = nextLine.trim();
        if (!trimmedLine.isEmpty()) {
          List<String> sents = new ArrayList<String>();
          List<String> cleanedSents = new ArrayList<String>();
          this.getSentences(trimmedLine, sents, null, cleanedSents);
          int paraPos = 1;
          for (String sen : sents) {
            Sentence s = new Sentence();
            s.setSentId(sentNo++);
            s.setParagraph(paraNo);
            s.setStringVal(sen);
            s.setParaPos(paraPos++);
            sentList.add(s);
          }
          paraNo++;
        }
      }
    } catch (Exception ex) {
      Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex);
    }
    return sentList;
  }

  public List<Sentence> getSentencesFromStr(String text) {
    List<Sentence> ret = new ArrayList<Sentence>();
    List<String> sentStrs = new ArrayList<String>();
    List<String> cleanedSents = new ArrayList<String>();
    // Use the simple splitter if requested, or if the OpenNLP model could not be loaded.
    if (sentModel == null || SENTENCE_FRAG == SIMPLE) {
      getSentences(text, sentStrs, null, cleanedSents);
    } else {
      SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
      String[] sentences = sentenceDetector.sentDetect(text);
      for (String sentence : sentences) {
        Logger.getLogger("DocProcessor").info(sentence);
        sentStrs.add(sentence);
      }
    }
    int sentNo = 0;
    for (String sen : sentStrs) {
      Sentence s = new Sentence();
      s.setSentId(sentNo);
      s.setParagraph(1);
      s.setStringVal(sen);
      s.setParaPos(sentNo);
      ret.add(s);
      sentNo++;
    }
    return ret;
  }

  // Naive tokenization on single spaces.
  public String[] getWords(String sent) {
    return sent.split(" ");
  }

  @Override
  public Stemmer getStemmer() {
    return stemmer;
  }
}