// blob: e491aecc712a1f0837ddd7f9a401c11acdc3084b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.summarization.preprocess;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.StandardCharsets;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Hashtable;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import opennlp.summarization.DocProcessor;
import opennlp.summarization.Sentence;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;
import opennlp.tools.stemmer.Stemmer;
/**
 * Default {@link DocProcessor}: splits text into sentences and offers simple word
 * splitting plus access to a stemmer.
 * <p>
 * {@link #getSentencesFromStr(String)} uses the OpenNLP sentence model handed to the
 * constructor. When that model could not be loaded, a {@link BreakIterator} based splitter
 * is used instead. {@link #docToSentList(String)} always uses the {@link BreakIterator}
 * based splitter.
 */
public class DefaultDocProcessor implements DocProcessor {

  private static final Logger LOGGER = Logger.getLogger(DefaultDocProcessor.class.getName());

  // Matches "&", an optional "#", two or three characters and a closing ";".
  // Presumably meant to strip short HTML entities such as "&lt;" or "&#39;".
  // NOTE(review): the blanks inside the character classes are literal, so a space
  // also matches - confirm whether that is intended before tightening the expression.
  private final static Pattern REPLACEMENT_PATTERN =
      Pattern.compile("&#?[0-9 a-z A-Z][0-9 a-z A-Z][0-9 a-z A-Z]?;");

  // Single and double quotes that are removed from every token before stemming.
  private static final Pattern QUOTE_PATTERN = Pattern.compile("\"|'");

  // Separator used by getWords(String); compiled once instead of on every call.
  private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

  // Sentence fragmentation to use..
  private static final int OPEN_NLP = 1;
  private static final int SIMPLE = 2;
  private static final int SENTENCE_FRAG = OPEN_NLP;

  // Stays null when no model could be loaded; getSentencesFromStr then falls back
  // to the BreakIterator based splitter.
  private final SentenceModel sentModel;
  private final Stemmer stemmer;

  /**
   * Creates a processor and tries to load the OpenNLP sentence model from the given stream.
   * <p>
   * Loading is best effort: if the stream is {@code null} or the model cannot be read, the
   * problem is logged and the instance falls back to {@link BreakIterator} based sentence
   * splitting. The stream is closed by this constructor.
   *
   * @param fragModelFile stream providing a serialized sentence model; may be {@code null}
   */
  public DefaultDocProcessor(InputStream fragModelFile) {
    stemmer = new PorterStemmer();
    SentenceModel model = null;
    if (fragModelFile == null) {
      LOGGER.warning("No sentence model stream given; "
          + "falling back to BreakIterator based sentence splitting.");
    } else {
      try (InputStream modelIn = new BufferedInputStream(fragModelFile)) {
        model = new SentenceModel(modelIn);
      } catch (Exception ex) {
        // Deliberately broad: a broken model must not prevent the processor from being
        // created, but the cause is logged in full instead of being reduced to its message.
        LOGGER.log(Level.WARNING, "Could not load the sentence model; "
            + "falling back to BreakIterator based sentence splitting.", ex);
      }
    }
    sentModel = model;
  }

  /**
   * Splits {@code str} into sentences with a {@link BreakIterator} and optionally produces
   * a stemmed, stop-word-free version of every sentence.
   *
   * @param str           document or paragraph to split
   * @param sentences     receives every sentence exactly as it appears in {@code str}
   * @param iidx          if not {@code null}, maps every stemmed word longer than one
   *                      character to the indices of the sentences containing it
   * @param processedSent if not {@code null}, receives one processed string per sentence
   */
  private void getSentences(String str, List<String> sentences,
                            Hashtable<String, List<Integer>> iidx, List<String> processedSent) {
    StopWords stopWords = StopWords.getInstance();
    BreakIterator sentItr = BreakIterator.getSentenceInstance(Locale.US);
    BreakIterator wrdItr = BreakIterator.getWordInstance(Locale.US);
    sentItr.setText(str);

    // Word level work is only needed when a caller asked for one of its outputs.
    boolean needWords = iidx != null || processedSent != null;

    int sentCnt = 0;
    int start = sentItr.first();
    for (int end = sentItr.next(); end != BreakIterator.DONE;
         start = end, end = sentItr.next()) {
      // Add the sentence as-is; any normalization happens at the word level.
      String sentence = str.substring(start, end);
      sentences.add(sentence);

      if (needWords) {
        String processed = processWords(sentence, sentCnt, wrdItr, stopWords, iidx);
        if (processedSent != null) {
          processedSent.add(processed);
        }
      }
      sentCnt++;
    }
  }

  /**
   * Strips quotes, drops stop words and stems the remaining tokens of one sentence.
   *
   * @return the stemmed tokens, each followed by a single blank
   */
  private String processWords(String sentence, int sentId, BreakIterator wrdItr,
                              StopWords stopWords, Hashtable<String, List<Integer>> iidx) {
    StringBuilder procSent = new StringBuilder();
    wrdItr.setText(sentence);
    int wrdStrt = wrdItr.first();
    for (int wrdEnd = wrdItr.next(); wrdEnd != BreakIterator.DONE;
         wrdStrt = wrdEnd, wrdEnd = wrdItr.next()) {
      String word = QUOTE_PATTERN.matcher(sentence.substring(wrdStrt, wrdEnd)).replaceAll("");
      // The word iterator also reports the whitespace between words as tokens; those
      // (and tokens that consisted of quotes only) carry no content and are skipped.
      if (word.trim().isEmpty() || stopWords.isStopWord(word)) {
        continue;
      }
      String stemmedWrd = stemmer.stem(word).toString();
      if (iidx != null && stemmedWrd.length() > 1) {
        List<Integer> sentList = iidx.get(stemmedWrd);
        if (sentList == null) {
          sentList = new ArrayList<>();
          iidx.put(stemmedWrd, sentList);
        }
        sentList.add(sentId);
      }
      procSent.append(stemmedWrd).append(" ");
    }
    return procSent.toString();
  }

  /**
   * Opens {@code fileName} for line-wise reading. The file is decoded as UTF-8 rather than
   * with the platform default charset, so the result does not depend on the host JVM.
   */
  private static LineNumberReader openReader(String fileName) throws IOException {
    return new LineNumberReader(
        new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8));
  }

  /** Creates a {@link Sentence} populated with the given ids and text. */
  private static Sentence newSentence(int sentId, int paragraph, int paraPos, String text) {
    Sentence s = new Sentence();
    s.setSentId(sentId);
    s.setParagraph(paragraph);
    s.setStringVal(text);
    s.setParaPos(paraPos);
    return s;
  }

  /**
   * Reads a file into a single string: every non-blank line is trimmed, stripped of the
   * sequences matched by the replacement pattern and appended followed by one blank.
   * <p>
   * Read errors are logged and not propagated; whatever was read up to that point
   * (possibly an empty string) is returned.
   *
   * @param fileName path of the file to read
   * @return the concatenated content of the file, never {@code null}
   */
  public String docToString(String fileName) {
    StringBuilder docBuffer = new StringBuilder();
    try (LineNumberReader lnr = openReader(fileName)) {
      String nextLine;
      while ((nextLine = lnr.readLine()) != null) {
        String trimmedLine = nextLine.trim();
        if (!trimmedLine.isEmpty()) {
          docBuffer.append(REPLACEMENT_PATTERN.matcher(trimmedLine).replaceAll("")).append(" ");
        }
      }
    } catch (Exception ex) {
      LOGGER.log(Level.SEVERE, "Failed to read document " + fileName, ex);
    }
    return docBuffer.toString();
  }

  /**
   * Reads a file and returns its sentences. Every non-blank line is treated as one
   * paragraph and split with the {@link BreakIterator} based splitter.
   * <p>
   * Sentence ids run from 0 across the whole file, paragraph numbers start at 0 and the
   * position of a sentence inside its paragraph starts at 1. Read errors are logged and
   * not propagated; the sentences collected so far are returned.
   *
   * @param fileName path of the file to read
   * @return the sentences of the file in reading order, never {@code null}
   */
  public List<Sentence> docToSentList(String fileName) {
    List<Sentence> sentList = new ArrayList<>();
    try (LineNumberReader lnr = openReader(fileName)) {
      String nextLine;
      int paraNo = 0;
      int sentNo = 0;
      while ((nextLine = lnr.readLine()) != null) {
        String trimmedLine = nextLine.trim();
        if (trimmedLine.isEmpty()) {
          continue;
        }
        List<String> sents = new ArrayList<>();
        // Only the raw sentences are needed here, so no word level output is requested.
        getSentences(trimmedLine, sents, null, null);
        int paraPos = 1;
        for (String sen : sents) {
          sentList.add(newSentence(sentNo++, paraNo, paraPos++, sen));
        }
        paraNo++;
      }
    } catch (Exception ex) {
      LOGGER.log(Level.SEVERE, "Failed to read document " + fileName, ex);
    }
    return sentList;
  }

  /**
   * Splits a text into sentences, using the OpenNLP model when one was loaded and the
   * {@link BreakIterator} based splitter otherwise.
   * <p>
   * All sentences are assigned to paragraph 1; sentence id and paragraph position are both
   * the zero-based index of the sentence.
   *
   * @param text the text to split; {@code null} or empty yields an empty list
   * @return the detected sentences in order, never {@code null}
   */
  @Override
  public List<Sentence> getSentencesFromStr(String text) {
    List<Sentence> ret = new ArrayList<>();
    if (text == null || text.isEmpty()) {
      return ret;
    }

    List<String> sentStrs = new ArrayList<>();
    // Custom/simple method if specified or open nlp model was not found
    if (sentModel == null || SENTENCE_FRAG == SIMPLE) {
      getSentences(text, sentStrs, null, null);
    } else {
      SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
      Collections.addAll(sentStrs, sentenceDetector.sentDetect(text));
    }

    int sentNo = 0;
    for (String sen : sentStrs) {
      ret.add(newSentence(sentNo, 1, sentNo, sen));
      sentNo++;
    }
    return ret;
  }

  /**
   * Splits a sentence into words at runs of whitespace after trimming it.
   *
   * @param sent the sentence to split; {@code null} yields an empty array
   * @return the words of the sentence; an empty or blank sentence yields an array holding
   *         one empty string, matching {@link String#split(String)}
   */
  @Override
  public String[] getWords(String sent) {
    if (sent == null) {
      return new String[0];
    }
    return WHITESPACE_PATTERN.split(sent.trim());
  }

  /**
   * @return the stemmer used by this processor
   */
  @Override
  public Stemmer getStemmer() {
    return stemmer;
  }
}