blob: 44c843cc25343c37acdeab498d48b45177705abb [file] [log] [blame]
package opennlp.tools.parse_thicket.external_rst;
import java.util.List;
import edu.arizona.sista.discourse.rstparser.DiscourseTree;
import edu.stanford.nlp.trees.Tree;
import opennlp.tools.parse_thicket.Pair;
import opennlp.tools.parse_thicket.ParseThicket;
import opennlp.tools.parse_thicket.ParseTreeNode;
import opennlp.tools.parse_thicket.VerbNetProcessor;
import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
import opennlp.tools.parse_thicket.kernel_interface.TreeExtenderByAnotherLinkedTree;
/*
* This is a subclass of ParseThicket with a focus on the discourse tree (DT).
* It produces string representations of the discourse tree for tree kernel learning.
*/
public class ParseThicketWithDiscourseTree extends ParseThicket {

    /** Marker that is rewritten (basic dump) or removed (other dumps) in the printed form of a terminal node. */
    private static final String TERMINAL_TEXT_MARKER = "Nucleus TEXT:";

    /** Initial capacity of the buffer used for each dump. */
    private static final int DUMP_BUFFER_CAPACITY = 100000;

    /** At most this many leading tokens are compared when aligning a text with the nodes of a sentence. */
    private static final int MAX_ALIGNMENT_TOKENS = 4;

    /** Terminal texts with at least this many space-separated tokens get plain word-POS pairs in the VerbNet dump. */
    private static final int VERBNET_TOKEN_LIMIT = 100;

    /** The flavours of discourse tree dump built by {@link #setDtDump()}. */
    private enum DumpMode {
        /** Relation labels plus the bracketed printed text of each terminal. */
        BASIC,
        /** Terminal text replaced by the tree representation of the aligned parse nodes. */
        POS_SEQUENCE,
        /** Terminal text replaced by a subtree of the sentence parse tree. */
        EMBEDDED_TREES,
        /** Terminal text replaced by word-POS pairs, with verbs expanded by the VerbNet processor. */
        VERB_NET
    }

    private DiscourseTree dt;
    private String dtDump;
    private String dtDumpWithPOS;
    private String dtDumpWithEmbeddedTrees;
    private String dtDumpWithVerbNet;
    private final TreeExtenderByAnotherLinkedTree extender = new TreeExtenderByAnotherLinkedTree();
    private final VerbNetProcessor verbBuilder = VerbNetProcessor.getInstance(null);

    /**
     * Creates a parse thicket from sentence parse trees and inter-sentence arcs.
     * Both arguments are passed unchanged to the {@link ParseThicket} constructor.
     *
     * @param ptTrees parse trees of the sentences
     * @param barcs   inter-sentence relation arcs
     */
    public ParseThicketWithDiscourseTree(List<Tree> ptTrees, List<WordWordInterSentenceRelationArc> barcs) {
        super(ptTrees, barcs);
    }

    /**
     * @return the stored discourse tree, or {@code null} if none has been set
     */
    public DiscourseTree getDt() {
        return dt;
    }

    /**
     * Stores the highest-level discourse tree. Only the first call made while no tree
     * is stored has an effect; later calls leave the stored tree untouched.
     *
     * @param dt the discourse tree to store
     */
    public void setDt(DiscourseTree dt) {
        if (this.dt == null) {
            this.dt = dt;
        }
    }

    /**
     * Builds all four string dumps of the stored discourse tree and caches them for
     * the {@code getDtDump*} accessors. A discourse tree must have been stored with
     * {@link #setDt(DiscourseTree)} first; otherwise the traversal dereferences
     * {@code null} and throws a {@link NullPointerException}.
     */
    public void setDtDump() {
        // Same build order as before: POS sequence, basic, embedded trees, VerbNet.
        dtDumpWithPOS = toStringBuilderDTWithPOSSeq(new StringBuilder(DUMP_BUFFER_CAPACITY), this.dt).toString();
        dtDump = toStringBuilderDT(new StringBuilder(DUMP_BUFFER_CAPACITY), this.dt).toString();
        dtDumpWithEmbeddedTrees =
                toStringBuilderDTWithEmbeddedTrees(new StringBuilder(DUMP_BUFFER_CAPACITY), this.dt).toString();
        dtDumpWithVerbNet = toStringBuilderDTWithVerbNet(new StringBuilder(DUMP_BUFFER_CAPACITY), this.dt).toString();
    }

    /** Appends the basic representation of the discourse tree rooted at {@code dt} to {@code sb}. */
    private StringBuilder toStringBuilderDT(StringBuilder sb, DiscourseTree dt) {
        return appendTree(sb, dt, DumpMode.BASIC);
    }

    /** Appends the representation of {@code dt} in which terminal text is replaced by aligned parse nodes. */
    private StringBuilder toStringBuilderDTWithPOSSeq(StringBuilder sb, DiscourseTree dt) {
        return appendTree(sb, dt, DumpMode.POS_SEQUENCE);
    }

    /** Appends the representation of {@code dt} in which terminal text is replaced by a parse subtree. */
    private StringBuilder toStringBuilderDTWithEmbeddedTrees(StringBuilder sb, DiscourseTree dt) {
        return appendTree(sb, dt, DumpMode.EMBEDDED_TREES);
    }

    /** Appends the representation of {@code dt} in which verbs are expanded by the VerbNet processor. */
    private StringBuilder toStringBuilderDTWithVerbNet(StringBuilder sb, DiscourseTree dt) {
        return appendTree(sb, dt, DumpMode.VERB_NET);
    }

    /**
     * Recursively appends {@code node} to {@code sb}. A non-terminal node is written as
     * {@code (label child child ...)}; a terminal node is written according to {@code mode}.
     *
     * @return {@code sb}, to allow chaining
     */
    private StringBuilder appendTree(StringBuilder sb, DiscourseTree node, DumpMode mode) {
        if (node.isTerminal()) {
            appendTerminal(sb, node, mode);
            return sb;
        }
        sb.append('(');
        if (node.relationLabel() != null) {
            sb.append(node.relationLabel());
        }
        DiscourseTree[] kids = node.children();
        if (kids != null) {
            for (DiscourseTree kid : kids) {
                sb.append(' ');
                appendTree(sb, kid, mode);
            }
        }
        return sb.append(')');
    }

    /**
     * Appends a terminal node. Nothing is written when the relation label is {@code null};
     * every mode except {@link DumpMode#BASIC} additionally requires a label longer than
     * two characters.
     */
    private void appendTerminal(StringBuilder sb, DiscourseTree node, DumpMode mode) {
        String label = node.relationLabel();
        if (label == null) {
            return;
        }
        if (mode != DumpMode.BASIC && label.length() <= 2) {
            return;
        }
        sb.append(label);

        switch (mode) {
            case BASIC: {
                String text = printTerminal(node, "(");
                // The last printed character is dropped and replaced by the closing bracket.
                text = text.substring(0, text.length() - 1) + ")";
                sb.append(text);
                break;
            }
            case POS_SEQUENCE: {
                String text = printTerminal(node, "");
                String textDump = substituteTextWithPOStext(text, nodesOfFirstTokenSentence(node));
                // No alignment: keep the label only rather than appending the literal "null".
                if (textDump != null) {
                    sb.append(textDump);
                }
                break;
            }
            case EMBEDDED_TREES: {
                String text = printTerminal(node, "");
                substituteTextWithParseTree(sb, text,
                        this.getSentenceTrees().get(node.firstToken().copy$default$1()));
                break;
            }
            case VERB_NET: {
                String text = printTerminal(node, "");
                List<ParseTreeNode> sentenceNodes = nodesOfFirstTokenSentence(node);
                String textDump;
                if (text.split(" ").length < VERBNET_TOKEN_LIMIT) {
                    // not too long: more informative substitution, including VerbNet data
                    textDump = substituteTextWithPOStextVerbNet(text, sentenceNodes);
                } else {
                    // otherwise just word-POS chains
                    textDump = substituteTextWithPOStext(text, sentenceNodes);
                }
                if (textDump != null) {
                    sb.append(textDump);
                }
                break;
            }
        }
    }

    /**
     * Prints {@code node} into a Scala string builder (the discourse tree API requires
     * one) and replaces every occurrence of the terminal text marker by {@code markerReplacement}.
     * NOTE(review): the result is not trimmed, so if print() ends its output with a line
     * break it stays attached to the last token - confirm against DiscourseTree.print.
     */
    private static String printTerminal(DiscourseTree node, String markerReplacement) {
        scala.collection.mutable.StringBuilder sbs = new scala.collection.mutable.StringBuilder(100);
        node.print(sbs, 0, false, true);
        return sbs.replaceAllLiterally(TERMINAL_TEXT_MARKER, markerReplacement);
    }

    /**
     * Looks up the parse nodes used to align the text of {@code node}.
     * NOTE(review): copy$default$1() is a compiler-generated Scala accessor; presumably it
     * yields the sentence index of the first token - confirm against the token offset class.
     */
    private List<ParseTreeNode> nodesOfFirstTokenSentence(DiscourseTree node) {
        return this.getNodesThicket().get(node.firstToken().copy$default$1());
    }

    /**
     * Replaces {@code text} by the tree representation of the parse nodes it aligns with.
     *
     * @param text space-separated words
     * @param list parse nodes to align against
     * @return the tree representation of the aligned nodes, or {@code null} if no alignment exists
     */
    private String substituteTextWithPOStext(String text, List<ParseTreeNode> list) {
        List<ParseTreeNode> aligned = findAlignedNodes(text.split(" "), list);
        if (aligned == null) {
            return null;
        }
        return ParseTreeNode.toTreeRepresentationString(aligned);
    }

    /**
     * Finds the first window of {@code list} that is as long as {@code tokens} and whose
     * leading words equal the leading tokens. Only windows lying completely inside
     * {@code list} are tried, so texts without a match no longer run past the end of the list.
     *
     * @return the aligned window (a view of {@code list}), or {@code null} when there are
     *         no tokens, no list, or no matching window
     */
    private static List<ParseTreeNode> findAlignedNodes(String[] tokens, List<ParseTreeNode> list) {
        if (list == null || tokens.length == 0) {
            return null;
        }
        int lastStart = list.size() - tokens.length;
        for (int offset = 0; offset <= lastStart; offset++) {
            List<ParseTreeNode> window = list.subList(offset, offset + tokens.length);
            if (matchesLeadingTokens(window, tokens)) {
                return window;
            }
        }
        return null;
    }

    /**
     * Compares the first {@link #MAX_ALIGNMENT_TOKENS} tokens (or all of them when there
     * are fewer) with the words of {@code window}; {@code window} must be at least as
     * long as {@code tokens}.
     */
    private static boolean matchesLeadingTokens(List<ParseTreeNode> window, String[] tokens) {
        int limit = Math.min(tokens.length, MAX_ALIGNMENT_TOKENS);
        for (int i = 0; i < limit; i++) {
            if (!tokens[i].equals(window.get(i).getWord())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Appends to {@code sb} the first subtree of {@code sentenceTree} that the extender
     * finds for the first two words of {@code text} (or the single word if there is only
     * one). Appends nothing when the text has no tokens or no subtree is found.
     */
    private void substituteTextWithParseTree(StringBuilder sb, String text, Tree sentenceTree) {
        String[] tokens = text.split(" ");
        if (tokens.length == 0) {
            // e.g. a text consisting of a single space splits into no tokens at all
            return;
        }
        String[] anchorWords = tokens.length > 1
                ? new String[]{tokens[0], tokens[1]}
                : new String[]{tokens[0]};
        List<Tree> foundTrees =
                extender.getASubtreeWithRootAsNodeForWord1(sentenceTree, sentenceTree, anchorWords);
        if (foundTrees == null || foundTrees.isEmpty()) {
            return;
        }
        extender.toStringBuilder(sb, foundTrees.get(0));
    }

    /**
     * Replaces {@code text} by a sequence of {@code (word POS)} pairs for the parse nodes
     * it aligns with. Nodes whose POS tag starts with {@code .}, {@code ,}, {@code ;} or
     * {@code !} are skipped. For verbs (POS starting with {@code VB}) that have a normalized
     * word, the representation supplied by the VerbNet processor is used when available.
     *
     * @param text space-separated words
     * @param list parse nodes to align against
     * @return the trimmed substitution, or {@code null} if no alignment exists
     */
    private String substituteTextWithPOStextVerbNet(String text, List<ParseTreeNode> list) {
        List<ParseTreeNode> aligned = findAlignedNodes(text.split(" "), list);
        if (aligned == null) {
            return null;
        }
        StringBuilder buf = new StringBuilder();
        for (ParseTreeNode ch : aligned) {
            try {
                if (ch.getPos().startsWith(".") || ch.getPos().startsWith(",")
                        || ch.getPos().startsWith(";") || ch.getPos().startsWith("!")) {
                    continue;
                }
                if (ch.getPos().startsWith("VB") && ch.getNormalizedWord() != null) {
                    // more detailed info for verbs
                    StringBuilder verbRepr =
                            verbBuilder.buildTreeRepresentationForTreeKernelLearning(ch.getNormalizedWord());
                    if (verbRepr != null) {
                        buf.append(" (" + verbRepr + ") ");
                    } else {
                        buf.append("(" + ch.getWord() + " " + ch.getPos() + ")");
                    }
                } else {
                    // other than verb
                    buf.append("(" + ch.getWord() + " " + ch.getPos() + ")");
                }
            } catch (Exception e) {
                // Best effort: a node that cannot be rendered is reported and skipped.
                e.printStackTrace();
            }
        }
        return buf.toString().trim();
    }

    /**
     * @return the basic dump built by {@link #setDtDump()}, or {@code null} if not built yet
     */
    public String getDtDump() {
        return this.dtDump;
    }

    /**
     * @return the POS-sequence dump built by {@link #setDtDump()}, or {@code null} if not built yet
     */
    public String getDtDumpWithPOS() {
        return this.dtDumpWithPOS;
    }

    /**
     * @return the embedded-trees dump built by {@link #setDtDump()}, or {@code null} if not built yet
     */
    public String getDtDumpWithEmbeddedTrees() {
        return this.dtDumpWithEmbeddedTrees;
    }

    /**
     * @return the VerbNet dump built by {@link #setDtDump()}, or {@code null} if not built yet
     */
    public String getDtDumpWithVerbNet() {
        return this.dtDumpWithVerbNet;
    }
}