blob: 083027630c66352191530f497c4f793d75b44a28 [file] [log] [blame]
package opennlp.tools.parse_thicket.matching;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import opennlp.tools.parse_thicket.IGeneralizer;
import opennlp.tools.parse_thicket.ParseCorefsBuilder;
import opennlp.tools.parse_thicket.ParseThicket;
import opennlp.tools.parse_thicket.ParseTreeNode;
import opennlp.tools.textsimilarity.LemmaPair;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
/**
 * Similarity component that compares two portions of text through their parse
 * thickets: each text is parsed, its phrases are extracted and grouped by
 * phrase type (NP, VP, PP), and the grouped phrases are handed to
 * {@link ParseTreeMatcherDeterministic} for matching.
 *
 * <p>This class performs no synchronization of its own; the parse-thicket
 * cache is a plain {@link HashMap} that is never evicted by this class.
 */
public class Matcher implements IGeneralizer<List<List<ParseTreeNode>>>{
    // Phrase types that are kept when grouping; any other type is dropped.
    private static final String PHRASE_TYPE_NP = "NP";
    private static final String PHRASE_TYPE_VP = "VP";
    private static final String PHRASE_TYPE_PP = "PP";

    ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic();
    ParseCorefsBuilder ptBuilder = ParseCorefsBuilder.getInstance();
    PT2ThicketPhraseBuilder phraseBuilder = new PT2ThicketPhraseBuilder();
    // Cache used by assessRelevanceCache: paragraph text -> its parse thicket.
    Map<String, ParseThicket> parseThicketHash = new HashMap<String, ParseThicket>();

    /** Creates a matcher; all collaborators are set up by the field initializers. */
    public Matcher(){
    }

    /**
     * The key function of the similarity component: takes two portions of text
     * and assesses their similarity by matching the grouped phrases of the parse
     * thickets built for each portion of text.
     *
     * @param para1
     *          text 1
     * @param para2
     *          text 2
     * @return the matching result produced by
     *         {@code matchTwoSentencesGroupedChunksDeterministic} for the
     *         NP/VP/PP phrase groups of the two texts
     */
    public List<List<ParseTreeChunk>> assessRelevance(String para1, String para2) {
        // first build PTs for each text
        ParseThicket pt1 = ptBuilder.buildParseThicket(para1);
        ParseThicket pt2 = ptBuilder.buildParseThicket(para2);
        // then build phrases and rst arcs
        List<List<ParseTreeNode>> phrs1 = phraseBuilder.buildPT2ptPhrases(pt1);
        List<List<ParseTreeNode>> phrs2 = phraseBuilder.buildPT2ptPhrases(pt2);
        return matchPhrases(phrs1, phrs2);
    }

    /**
     * Same as {@link #assessRelevance(String, String)}, except that the parse
     * thicket of each text is looked up in (and, when missing, stored into) an
     * in-memory cache keyed by the text itself. Phrases are still rebuilt from
     * the thickets on every call.
     *
     * @param para1
     *          text 1
     * @param para2
     *          text 2
     * @return the matching result for the NP/VP/PP phrase groups of the two texts
     */
    public List<List<ParseTreeChunk>> assessRelevanceCache(String para1, String para2) {
        // first build (or fetch cached) PTs for each text
        ParseThicket pt1 = getOrBuildParseThicket(para1);
        ParseThicket pt2 = getOrBuildParseThicket(para2);
        // then build phrases and rst arcs
        List<List<ParseTreeNode>> phrs1 = phraseBuilder.buildPT2ptPhrases(pt1);
        List<List<ParseTreeNode>> phrs2 = phraseBuilder.buildPT2ptPhrases(pt2);
        return matchPhrases(phrs1, phrs2);
    }

    /**
     * Matches two already-extracted phrase lists, skipping the parsing step.
     *
     * @param phrs1
     *          phrases of the first text, each phrase being a list of nodes
     * @param phrs2
     *          phrases of the second text, each phrase being a list of nodes
     * @return the matching result for the NP/VP/PP phrase groups of the two inputs
     */
    public List<List<ParseTreeChunk>> generalize(List<List<ParseTreeNode>> phrs1,
            List<List<ParseTreeNode>> phrs2) {
        return matchPhrases(phrs1, phrs2);
    }

    // Groups both phrase lists by type and runs the deterministic matcher on them.
    private List<List<ParseTreeChunk>> matchPhrases(List<List<ParseTreeNode>> phrs1,
            List<List<ParseTreeNode>> phrs2) {
        // group phrases by type
        List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(phrs1);
        List<List<ParseTreeChunk>> sent2GrpLst = formGroupedPhrasesFromChunksForPara(phrs2);
        return md.matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
    }

    // Returns the cached parse thicket for the text, building and caching it on a miss.
    private ParseThicket getOrBuildParseThicket(String para) {
        ParseThicket pt = parseThicketHash.get(para);
        if (pt == null) {
            pt = ptBuilder.buildParseThicket(para);
            parseThicketHash.put(para, pt);
        }
        return pt;
    }

    /**
     * Converts phrases into chunks and distributes them into three lists by the
     * phrase type of each phrase's first node.
     *
     * @param phrs
     *          phrases to group; {@code null} is treated as no phrases
     * @return a list of exactly three lists, in the order NP, VP, PP; phrases of
     *         any other type, as well as null or empty phrases and phrases whose
     *         first node has no phrase type, are left out
     */
    private List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara(
            List<List<ParseTreeNode>> phrs) {
        List<ParseTreeChunk> nps = new ArrayList<ParseTreeChunk>();
        List<ParseTreeChunk> vps = new ArrayList<ParseTreeChunk>();
        List<ParseTreeChunk> pps = new ArrayList<ParseTreeChunk>();
        if (phrs != null) {
            for (List<ParseTreeNode> ps : phrs) {
                // A phrase without nodes has no head to read the type from;
                // skip it instead of failing on get(0).
                if (ps == null || ps.isEmpty()) {
                    continue;
                }
                ParseTreeChunk ch = convertNodeListIntoChunk(ps);
                String ptype = ps.get(0).getPhraseType();
                // Constant-first equals keeps a null phrase type from throwing.
                if (PHRASE_TYPE_NP.equals(ptype)) {
                    nps.add(ch);
                } else if (PHRASE_TYPE_VP.equals(ptype)) {
                    vps.add(ch);
                } else if (PHRASE_TYPE_PP.equals(ptype)) {
                    pps.add(ch);
                }
            }
        }
        List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
        results.add(nps);
        results.add(vps);
        results.add(pps);
        return results;
    }

    /**
     * Builds a chunk from the words and POS tags of the given nodes and sets its
     * main POS to the phrase type of the first node.
     *
     * @param ps
     *          nodes of one phrase; must contain at least one node
     * @return the chunk for the phrase
     */
    private ParseTreeChunk convertNodeListIntoChunk(List<ParseTreeNode> ps) {
        List<String> lemmas = new ArrayList<String>();
        List<String> poss = new ArrayList<String>();
        for (ParseTreeNode n : ps) {
            lemmas.add(n.getWord());
            poss.add(n.getPos());
        }
        ParseTreeChunk ch = new ParseTreeChunk(lemmas, poss, 0, 0);
        ch.setMainPOS(ps.get(0).getPhraseType());
        return ch;
    }

    /**
     * Main entry point into the PT builder if rst arcs are required: builds the
     * parse thicket for the text and then runs the phrase builder over it.
     *
     * @param para
     *          text to parse
     * @return the parse thicket built for the text
     */
    public ParseThicket buildParseThicketFromTextWithRST(String para){
        ParseThicket pt = ptBuilder.buildParseThicket(para);
        // The returned phrases are discarded; the call is kept for its effect on
        // pt (presumably it attaches the rst arcs -- confirm in PT2ThicketPhraseBuilder).
        phraseBuilder.buildPT2ptPhrases(pt);
        return pt;
    }

    /**
     * Untyped generalization required by {@link IGeneralizer}. Not implemented:
     * this method ignores its arguments and always returns {@code null}, so
     * callers must null-check the result. Use
     * {@link #generalize(List, List)} for the working, typed variant.
     *
     * @param o1
     *          ignored
     * @param o2
     *          ignored
     * @return always {@code null}
     */
    @Override
    public List<List<List<ParseTreeNode>>> generalize(Object o1, Object o2) {
        // TODO implement; kept returning null so existing callers see no change
        return null;
    }
}