/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.textsimilarity;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import opennlp.tools.lang.english.SentenceDetector;
import opennlp.tools.lang.english.Tokenizer;
import opennlp.tools.lang.english.TreebankParser;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.chunking.Parser;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.util.Span;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
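
/**
 * Matches two portions of text by comparing their syntactic parse trees,
 * built with the older OpenNLP English sentence detector, tokenizer and
 * treebank parser. Can be used either through the lazily initialized
 * singleton ({@link #getInstance()}) or by constructing an instance with an
 * explicit resource directory.
 */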
public class ParserChunker2MatcherOlderOpenNLP {
  public static final String resourcesDir = (System.getProperty("os.name")
      .toLowerCase().contains("win") ? "C:/workspace/ZSearch/resources_external"
      : "/var/search/solr-1.2/resources");
  private static ParserChunker2MatcherOlderOpenNLP m_SyntMatcher = null;
private static final Logger LOG = LoggerFactory.getLogger(ParserChunker2MatcherOlderOpenNLP.class);
private SentenceDetectorME sentenceDetectorME = null;
private Tokenizer tokenizer = null;
private Parser parser = null;
private final boolean useTagDict = true;
private final boolean useCaseInsensitiveTagDict = false;
private final int beamSize = Parser.defaultBeamSize;
private final double advancePercentage = Parser.defaultAdvancePercentage;
private Map<String, List<List<ParseTreeChunk>>> parsingsCache = new HashMap<String, List<List<ParseTreeChunk>>>();
  private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
private ParseTreeMatcherDeterministic parseTreeMatcherDeterministic = new ParseTreeMatcherDeterministic();
  /**
   * Get the ParserChunker2MatcherOlderOpenNLP singleton instance, loading the
   * OpenNLP models from the default resources directory on first use.
   *
   * @return the shared matcher instance
   */
  public static ParserChunker2MatcherOlderOpenNLP getInstance() {
String dir = resourcesDir + "/models";
if (m_SyntMatcher == null) {
m_SyntMatcher = new ParserChunker2MatcherOlderOpenNLP();
try {
m_SyntMatcher.loadOpenNLP(dir);
} catch (Exception e) {
LOG.error("Problem loading openNLP! ", 2);
}
}
return m_SyntMatcher;
}
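  /**
   * Get the singleton instance, loading the OpenNLP models from
   * {@code resourceDirSpec}/models on first use.
   *
   * @param resourceDirSpec the resource directory containing the models folder
   * @return the shared matcher instance
   */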
  public static ParserChunker2MatcherOlderOpenNLP getInstance(String resourceDirSpec) {
String dir = resourceDirSpec + "/models";
if (m_SyntMatcher == null) {
m_SyntMatcher = new ParserChunker2MatcherOlderOpenNLP();
try {
m_SyntMatcher.loadOpenNLP(dir);
} catch (Exception e) {
        LOG.error("Problem loading openNLP! ", e);
}
}
return m_SyntMatcher;
}
  public ParserChunker2MatcherOlderOpenNLP() {
    // Models are loaded later, either lazily via getInstance() or through an
    // explicit loadOpenNLP() call.
  }
public ParserChunker2MatcherOlderOpenNLP(String resourcesDir) {
try {
loadOpenNLP(resourcesDir);
} catch (IOException e) {
LOG.error("Problem loading openNLP! ", e);
}
}
public ParserChunker2MatcherOlderOpenNLP(String resourcesDir, String language) {
try {
loadOpenNLP(resourcesDir, language);
} catch (IOException e) {
LOG.error("Problem loading openNLP! ", e);
}
}
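  /**
   * Load the English sentence detector, tokenizer and treebank parser models
   * from the given directory.
   *
   * @param dir directory holding the sentdetect, tokenize and parser models
   * @throws IOException if any model file cannot be read
   */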
protected void loadOpenNLP(String dir) throws IOException {
sentenceDetectorME = new SentenceDetector(dir
+ "/sentdetect/EnglishSD.bin.gz");
tokenizer = new Tokenizer(dir + "/tokenize/EnglishTok.bin.gz");
parser = (Parser) TreebankParser.getParser(dir + "/parser", useTagDict,
useCaseInsensitiveTagDict, beamSize, advancePercentage);
}
  protected void loadOpenNLP(String dir, String lang) throws IOException {
    // Only English models are available: "es" currently falls back to the
    // English models, and any other language leaves the components unloaded.
    if (lang.equalsIgnoreCase("es")) {
      loadOpenNLP(dir);
    }
  }
// TODO is synchronized needed here?
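  /**
   * Parse a single sentence and return the most likely parses, cutting the
   * list at the first parse whose probability drops more than
   * {@code confidence} below its predecessor.
   *
   * @param line the sentence to parse
   * @param p the parser to use
   * @param confidence the maximum allowed probability drop between kept parses
   * @return the retained parses, or {@code null} if parsing failed
   */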
  public synchronized Parse[] parseLine(String line, Parser p, double confidence) {
    String[] tokens = tokenizer.tokenize(line);
    StringBuilder sb = new StringBuilder();
    for (String t : tokens)
      sb.append(t).append(" ");
    Parse[] ps = null;
    try {
      ps = TreebankParser.parseLine(sb.toString(), p, 2);
    } catch (Exception e) {
      // unable to parse for whatever reason
      LOG.error("Problem parsing " + sb.toString(), e);
    }
    if (ps == null)
      return null;
    // keep the top parses until the probability drops by more than 'confidence'
    int i = 1;
    for (; i < ps.length; i++) {
      if (ps[i - 1].getProb() - ps[i].getProb() > confidence)
        break;
    }
    if (i < ps.length)
      return Arrays.copyOf(ps, i);
    return ps;
  }
// TODO is synchronized needed here?
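  /**
   * Score how acceptable the phrasing of a sentence is: for each of the top
   * parses, the absolute parse probability normalized by the number of words.
   *
   * @param line the sentence to score
   * @return an array of up to five scores; entries default to -20.0 when a
   *         parse is unavailable or parsing fails
   */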
  protected synchronized Double[] getPhrasingAcceptabilityData(String line) {
    int nParsings = 5;
    String[] tokens = tokenizer.tokenize(line);
    int numWords = tokens.length;
    StringBuilder sb = new StringBuilder();
    for (String t : tokens)
      sb.append(t).append(" ");
    Double[] result = new Double[nParsings];
    for (int i = 0; i < result.length; i++)
      result[i] = -20.0; // default score when no parse is available
    Parse[] ps;
    try {
      ps = TreebankParser.parseLine(sb.toString(), parser, nParsings);
    } catch (Exception e) {
      // unable to parse for whatever reason; keep the default scores
      return result;
    }
    if (ps == null)
      return result;
    for (int i = 0; i < ps.length && i < result.length; i++) {
      result[i] = Math.abs(ps[i].getProb() / (double) numWords);
    }
    return result;
  }
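  /**
   * @param p the parse node to inspect
   * @return true if every child of the given parse node is a POS tag
   */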
  protected boolean allChildNodesArePOSTags(Parse p) {
    for (Parse subParse : p.getChildren())
      if (!subParse.isPosTag())
        return false;
    return true;
  }
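  /**
   * Recursively collect the text of all lowest-level noun phrases (NP nodes
   * whose children are all POS tags) in the given parse tree.
   *
   * @param p the parse tree to walk
   * @return the noun phrase strings found
   */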
  protected ArrayList<String> getNounPhrases(Parse p) {
    ArrayList<String> nounPhrases = new ArrayList<String>();
    for (Parse subParse : p.getChildren()) {
      if (subParse.getType().equals("NP") && allChildNodesArePOSTags(subParse)) {
        Span span = subParse.getSpan();
        nounPhrases.add(p.getText().substring(span.getStart(), span.getEnd()));
      } else if (!subParse.isPosTag()) {
        nounPhrases.addAll(getNounPhrases(subParse));
      }
    }
    return nounPhrases;
  }
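  /**
   * Recursively collect a {@link LemmaPair} (phrase type, surface text and
   * start offset) for every constituent in the given parse tree.
   *
   * @param p the parse tree to walk
   * @return the lemma pairs for all phrases in the tree
   */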
  public List<LemmaPair> getAllPhrasesTWPairs(Parse p) {
    List<LemmaPair> lemmaPairs = new ArrayList<LemmaPair>();
    for (Parse subParse : p.getChildren()) {
      Span span = subParse.getSpan();
      String expr = p.getText().substring(span.getStart(), span.getEnd());
      lemmaPairs.add(new LemmaPair(subParse.getType(), expr, span.getStart()));
      if (!subParse.isPosTag())
        lemmaPairs.addAll(getAllPhrasesTWPairs(subParse));
    }
    return lemmaPairs;
  }
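  /**
   * Parse two sentences and match their phrase lists directly, without using
   * the parsings cache.
   *
   * @param sent1 the first sentence
   * @param sent2 the second sentence
   * @return the groups of matching parse tree chunks
   */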
protected List<List<ParseTreeChunk>> matchOrigSentences(String sent1,
String sent2) {
// with tokenizer now
Parse[] parses1 = parseLine(sent1, parser, 1);
Parse[] parses2 = parseLine(sent2, parser, 1);
List<LemmaPair> origChunks1 = getAllPhrasesTWPairs(parses1[0]);
List<LemmaPair> origChunks2 = getAllPhrasesTWPairs(parses2[0]);
    LOG.debug("Chunks 1: " + origChunks1);
    LOG.debug("Chunks 2: " + origChunks2);
ParseTreeChunk matcher = new ParseTreeChunk();
List<List<ParseTreeChunk>> matchResult = matcher
.matchTwoSentencesGivenPairLists(origChunks1, origChunks2);
return matchResult;
}
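  /**
   * Match two (possibly multi-sentence) texts, caching the grouped chunk
   * representation of each input so repeated comparisons avoid re-parsing.
   *
   * @param sent1 the first text
   * @param sent2 the second text
   * @return the groups of matching parse tree chunks
   */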
public List<List<ParseTreeChunk>> matchOrigSentencesCache(String sent1,
String sent2) {
sent1 = sent1.replace("'s", " 's").replace(":", " ");
sent2 = sent2.replace("'s", " 's").replace(":", " ");
ParseTreeChunk matcher = new ParseTreeChunk();
List<List<ParseTreeChunk>> sent1GrpLst = null, sent2GrpLst = null;
sent1GrpLst = parsingsCache.get(sent1);
if (sent1GrpLst == null) {
List<LemmaPair> origChunks1 = new ArrayList<LemmaPair>();
String[] sents1 = sentenceDetectorME.sentDetect(sent1);
for (String s1 : sents1) {
Parse[] parses1 = parseLine(s1, parser, 1);
origChunks1.addAll(getAllPhrasesTWPairs(parses1[0]));
}
List<ParseTreeChunk> chunk1List = matcher.buildChunks(origChunks1);
sent1GrpLst = matcher.groupChunksAsParses(chunk1List);
parsingsCache.put(sent1, sent1GrpLst);
      LOG.debug("Chunks 1: " + origChunks1);
}
sent2GrpLst = parsingsCache.get(sent2);
if (sent2GrpLst == null) {
List<LemmaPair> origChunks2 = new ArrayList<LemmaPair>();
String[] sents2 = sentenceDetectorME.sentDetect(sent2);
for (String s2 : sents2) {
Parse[] parses2 = parseLine(s2, parser, 1);
origChunks2.addAll(getAllPhrasesTWPairs(parses2[0]));
}
List<ParseTreeChunk> chunk2List = matcher.buildChunks(origChunks2);
sent2GrpLst = matcher.groupChunksAsParses(chunk2List);
parsingsCache.put(sent2, sent2GrpLst);
      LOG.debug("Chunks 2: " + origChunks2);
}
return parseTreeMatcherDeterministic
.matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
}
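  /**
   * Assess how relevant a mined sentence is to a query sentence by matching
   * their grouped parse tree chunks; only the second argument is cached.
   *
   * @param minedSent1 the mined text (always re-parsed)
   * @param sent2 the text to compare against (cached)
   * @return the match result together with the chunks of the mined text
   */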
public SentencePairMatchResult assessRelevance(String minedSent1, String sent2) {
minedSent1 = minedSent1.replace("'s", " 's").replace(":", " ")
.replace("’s", " 's");
sent2 = sent2.replace("'s", " 's").replace(":", " ").replace("’s", " 's");
ParseTreeChunk matcher = new ParseTreeChunk();
List<List<ParseTreeChunk>> sent1GrpLst = null, sent2GrpLst = null;
    // minedSent1 is not looked up in the parsings cache: its chunks
    // (origChunks1) are also needed for the returned SentencePairMatchResult.
List<LemmaPair> origChunks1 = new ArrayList<LemmaPair>();
String[] sents1 = sentenceDetectorME.sentDetect(minedSent1);
for (String s1 : sents1) {
Parse[] parses1 = parseLine(s1, parser, 1);
origChunks1.addAll(getAllPhrasesTWPairs(parses1[0]));
}
List<ParseTreeChunk> chunk1List = matcher.buildChunks(origChunks1);
sent1GrpLst = matcher.groupChunksAsParses(chunk1List);
parsingsCache.put(minedSent1, sent1GrpLst);
sent2GrpLst = parsingsCache.get(sent2);
if (sent2GrpLst == null) {
List<LemmaPair> origChunks2 = new ArrayList<LemmaPair>();
String[] sents2 = sentenceDetectorME.sentDetect(sent2);
for (String s2 : sents2) {
Parse[] parses2 = parseLine(s2, parser, 1);
origChunks2.addAll(getAllPhrasesTWPairs(parses2[0]));
}
List<ParseTreeChunk> chunk2List = matcher.buildChunks(origChunks2);
sent2GrpLst = matcher.groupChunksAsParses(chunk2List);
parsingsCache.put(sent2, sent2GrpLst);
}
    List<List<ParseTreeChunk>> res = parseTreeMatcherDeterministic
        .matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
return new SentencePairMatchResult(res, origChunks1);
}
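  /**
   * For each sentence of a paragraph, find the class representative sentence
   * whose phrase-level match scores highest.
   *
   * @param para1 the paragraph whose sentences are mapped
   * @param classStr the text holding the class representative sentences
   * @return a map from each paragraph sentence (as the string form of its
   *         chunk list) to the chunks of its best matching class sentence
   */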
public Map<String, List<LemmaPair>> findMappingBetweenSentencesOfAParagraphAndAClassReps(
String para1, String classStr) {
ParseTreeChunk matcher = new ParseTreeChunk();
String[] sents = sentenceDetectorME.sentDetect(para1);
String[] classSents = sentenceDetectorME.sentDetect(classStr);
List<List<LemmaPair>> parseSentList = new ArrayList<List<LemmaPair>>();
for (String s : sents) {
parseSentList.add(getAllPhrasesTWPairs((parseLine(s, parser, 1)[0])));
}
List<List<LemmaPair>> parseClassList = new ArrayList<List<LemmaPair>>();
for (String s : classSents) {
parseClassList.add(getAllPhrasesTWPairs((parseLine(s, parser, 1)[0])));
}
Map<String, List<LemmaPair>> sentence_bestClassRep = new HashMap<String, List<LemmaPair>>();
for (List<LemmaPair> chunksSent : parseSentList) {
Double maxScore = -1.0;
for (List<LemmaPair> chunksClass : parseClassList) {
List<List<ParseTreeChunk>> matchResult = matcher
.matchTwoSentencesGivenPairLists(chunksSent, chunksClass);
Double score = parseTreeChunkListScorer
.getParseTreeChunkListScore(matchResult);
if (score > maxScore) {
maxScore = score;
sentence_bestClassRep.put(chunksSent.toString(), chunksClass);
}
}
}
return sentence_bestClassRep;
}
public SentenceDetectorME getSentenceDetectorME() {
return sentenceDetectorME;
}
public Parser getParser() {
return parser;
}
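
  // A minimal usage sketch (illustrative only; the sentences are hypothetical
  // and the OpenNLP model files are assumed to exist under resourcesDir + "/models"):
  public static void main(String[] args) {
    ParserChunker2MatcherOlderOpenNLP matcher = ParserChunker2MatcherOlderOpenNLP
        .getInstance();
    SentencePairMatchResult res = matcher.assessRelevance(
        "I bought a new digital camera with 12 megapixels.",
        "Where can I buy a digital camera?");
    System.out.println(res);
  }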
}
// Suggested JVM options: -Xms500M -Xmx500M