/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.parse_thicket.kernel_interface; | |
import java.util.ArrayList; | |
import java.util.List; | |
import java.util.logging.Logger; | |
import opennlp.tools.jsmlearning.ProfileReaderWriter; | |
import opennlp.tools.parse_thicket.ParseThicket; | |
import opennlp.tools.parse_thicket.ParseTreeNode; | |
import opennlp.tools.parse_thicket.VerbNetProcessor; | |
import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc; | |
import opennlp.tools.parse_thicket.matching.Matcher; | |
import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder; | |
import edu.stanford.nlp.trees.Tree; | |
public class TreeExtenderByAnotherLinkedTree extends PT2ThicketPhraseBuilder { | |
private static Logger log = Logger | |
.getLogger("opennlp.tools.parse_thicket.kernel_interface.TreeExtenderByAnotherLinkedTree"); | |
public List<String> buildForestForCorefArcs(ParseThicket pt){ | |
List<String> results = new ArrayList<String>(); | |
for(WordWordInterSentenceRelationArc arc: pt.getArcs()){ | |
//if (!arc.getArcType().getType().startsWith("coref")) | |
// continue; | |
int fromSent = arc.getCodeFrom().getFirst(); | |
int toSent = arc.getCodeTo().getFirst(); | |
if (fromSent <1 || toSent <1 ) // TODO problem in sentence enumeration => skip building extended trees | |
return results; | |
String wordFrom = arc.getLemmaFrom(); | |
String wordTo = arc.getLemmaTo(); | |
List<Tree> trees = getASubtreeWithRootAsNodeForWord1(pt.getSentences().get(fromSent-1), | |
pt.getSentences().get(fromSent-1), new String[]{ wordFrom}); | |
if (trees==null || trees.size()<1) | |
continue; | |
System.out.println(trees); | |
StringBuilder sb = new StringBuilder(10000); | |
toStringBuilderExtenderByAnotherLinkedTree1(sb, pt.getSentences().get(toSent-1), trees.get(0), new String[]{wordTo}); | |
System.out.println(sb.toString()); | |
results.add(sb.toString()); | |
} | |
// if no arcs then orig sentences | |
if (results.isEmpty()){ | |
for(Tree t: pt.getSentences()){ | |
results.add(t.toString()); | |
} | |
} | |
return results; | |
} | |
// sentences in pt are enumerarted starting from 0; | |
//this func works with Sista version of Stanford NLP and sentences are coded from 0 | |
public List<String> buildForestForRSTArcs(ParseThicket pt){ | |
List<String> results = new ArrayList<String>(); | |
for(WordWordInterSentenceRelationArc arc: pt.getArcs()){ | |
// TODO - uncomment | |
//if (!arc.getArcType().getType().startsWith("rst")) | |
// continue; | |
int fromSent = arc.getCodeFrom().getFirst(); | |
int toSent = arc.getCodeTo().getFirst(); | |
String wordFrom = arc.getLemmaFrom(); | |
String wordTo = arc.getLemmaTo(); | |
if (wordFrom == null || wordFrom.length()<1 || wordTo == null || wordTo.length()<1) | |
log.severe("Empty lemmas for RST arc "+ arc); | |
List<Tree> trees = getASubtreeWithRootAsNodeForWord1(pt.getSentences().get(fromSent), | |
pt.getSentences().get(fromSent), new String[]{ wordFrom}); | |
if (trees==null || trees.size()<1) | |
continue; | |
System.out.println(trees); | |
StringBuilder sb = new StringBuilder(10000); | |
Tree tree = trees.get(0); | |
// instead of phrase type for the root of the tree, we want to put the RST relation name | |
if (arc.getArcType().getType().startsWith("rst")) | |
tree.setValue(arc.getArcType().getSubtype()); | |
toStringBuilderExtenderByAnotherLinkedTree1(sb, pt.getSentences().get(toSent), tree, new String[]{wordTo}); | |
System.out.println(sb.toString()); | |
results.add(sb.toString()); | |
} | |
// if no arcs then orig sentences | |
if (results.isEmpty()){ | |
for(Tree t: pt.getSentences()){ | |
results.add(t.toString()); | |
} | |
} | |
return results; | |
} | |
public StringBuilder toStringBuilderExtenderByAnotherLinkedTree1(StringBuilder sb, Tree t, Tree treeToInsert, String[] corefWords) { | |
if (t.isLeaf()) { | |
if (t.label() != null) { | |
sb.append(t.label().value()); | |
} | |
return sb; | |
} else { | |
sb.append('('); | |
if (t.label() != null) { | |
if (t.value() != null) { | |
sb.append(t.label().value()); | |
} | |
} | |
boolean bInsertNow=false; | |
Tree[] kids = t.children(); | |
if (kids != null) { | |
for (Tree kid : kids) { | |
if (corefWords!=null){ | |
String word = corefWords[corefWords.length-1]; | |
String phraseStr = kid.toString(); | |
phraseStr=phraseStr.replace(")", ""); | |
if (phraseStr.endsWith(word)){ | |
bInsertNow=true; | |
} | |
} | |
} | |
if (bInsertNow){ | |
for (Tree kid : kids) { | |
sb.append(' '); | |
toStringBuilderExtenderByAnotherLinkedTree1(sb, kid, null, null); | |
} | |
sb.append(' '); | |
toStringBuilderExtenderByAnotherLinkedTree1(sb, treeToInsert, null, null); | |
} else { | |
for (Tree kid : kids) { | |
sb.append(' '); | |
toStringBuilderExtenderByAnotherLinkedTree1(sb, kid, treeToInsert, corefWords); | |
} | |
} | |
} | |
return sb.append(')'); | |
} | |
} | |
// given a parse tree and a | |
public List<Tree> getASubtreeWithRootAsNodeForWord1(Tree tree, Tree currentSubTree, String[] corefWords){ | |
if (currentSubTree.isLeaf()){ | |
return null; | |
} | |
List<Tree> result = null; | |
Tree[] kids = currentSubTree.children(); | |
if (kids != null) { | |
boolean bFound=false; | |
String word = corefWords[corefWords.length-1]; | |
for (Tree kid : kids) { | |
if (bFound){ | |
result.add(kid); | |
} else { | |
String phraseStr = kid.toString(); | |
phraseStr=phraseStr.replace(")", ""); | |
if (phraseStr.endsWith(word)){ // found | |
bFound=true; | |
result = new ArrayList<Tree>(); | |
} | |
} | |
} | |
if (bFound){ | |
return result; | |
} | |
// if not a selected node, proceed with iteration | |
for (Tree kid : kids) { | |
List<Tree> ts = getASubtreeWithRootAsNodeForWord1(tree, kid, corefWords); | |
if (ts!=null) | |
return ts; | |
} | |
} | |
return null; | |
} | |
// now obsolete | |
public Tree[] getASubtreeWithRootAsNodeForWord(Tree tree, Tree currentSubTree, String[] corefWords){ | |
if (currentSubTree.isLeaf()){ | |
return null; | |
} | |
boolean bInsertNow=false; | |
/*List<ParseTreeNode> bigTreeNodes = parsePhrase(currentSubTree.label().value()); | |
for(ParseTreeNode smallNode: bigTreeNodes ){ | |
if (bigTreeNodes.get(0).getWord().equals("") ) | |
continue; | |
String word = bigTreeNodes.get(0).getWord(); | |
for(String cWord: corefWords){ | |
if (word.equalsIgnoreCase(cWord)) | |
bInsertNow=true; | |
} | |
} */ | |
String nodePhraseStr = currentSubTree.toString(); | |
System.out.println(nodePhraseStr); | |
for(String w: corefWords) | |
nodePhraseStr = nodePhraseStr.replace(w, ""); | |
// all words are covered | |
if (nodePhraseStr.toUpperCase().equals(nodePhraseStr)) | |
bInsertNow=true; | |
//if(bInsertNow) | |
// return currentSubTree; | |
Tree[] kids = currentSubTree.children(); | |
if (kids != null) { | |
/*for (Tree kid : kids) { | |
List<ParseTreeNode> bigTreeNodes = parsePhrase(kid.label().value()); | |
if (bigTreeNodes!=null && bigTreeNodes.size()>0 && bigTreeNodes.get(0)!=null && | |
bigTreeNodes.get(0).getWord().equalsIgnoreCase(corefWords[0])){ | |
bInsertNow=true; | |
return kids; | |
} | |
}*/ | |
for (Tree kid : kids) { | |
Tree[] t = getASubtreeWithRootAsNodeForWord(tree, kid, corefWords); | |
if (t!=null) | |
return t; | |
} | |
} | |
return null; | |
} | |
/**
 * Appends the bracketed form of {@code t} to {@code sb}, splicing part of
 * {@code treeToInsert} in at nodes whose label matches it.
 * <p>
 * A node is an insertion point when the first word obtained by
 * {@code parsePhrase} from its label value is non-empty and equals, ignoring
 * case, one of the words obtained from the label value of
 * {@code treeToInsert.getChild(0).getChild(0).getChild(0)}. At an insertion
 * point the node's children are rendered plainly, followed by
 * {@code treeToInsert.getChild(0).getChild(1)}. Everywhere else the children are
 * rendered recursively with the same {@code treeToInsert}.
 * <p>
 * NOTE(review): {@code parsePhrase} is defined outside this block (presumably
 * inherited); {@code treeToInsert} is assumed to have the nested child
 * structure accessed above - confirm with callers.
 *
 * @param sb           builder receiving the output
 * @param t            tree to render
 * @param treeToInsert tree supplying the part to splice in, or null for plain rendering
 * @return {@code sb}, to allow chaining
 */
public StringBuilder toStringBuilderExtenderByAnotherLinkedTree(StringBuilder sb, Tree t, Tree treeToInsert) {
    if (t.isLeaf()) {
        if (t.label() != null) {
            sb.append(t.label().value());
        }
        return sb;
    }

    sb.append('(');
    if (t.label() != null && t.value() != null) {
        sb.append(t.label().value());
    }

    boolean bInsertNow = false;
    // we try to match trees to find out if we are at the insertion position;
    // an unlabelled node cannot be matched, so it is never an insertion point
    if (treeToInsert != null && t.label() != null) {
        List<ParseTreeNode> bigTreeNodes = parsePhrase(t.label().value());
        List<ParseTreeNode> smallTreeNodes = parsePhrase(treeToInsert.getChild(0).getChild(0).getChild(0).label().value());

        log.fine(t + " \n "+ treeToInsert+ "\n");

        if (smallTreeNodes.size() > 0 && bigTreeNodes.size() > 0) {
            String bigHeadWord = bigTreeNodes.get(0).getWord();
            if (!bigHeadWord.equals("")) {
                for (ParseTreeNode smallNode : smallTreeNodes) {
                    if (bigHeadWord.equalsIgnoreCase(smallNode.getWord())) {
                        bInsertNow = true;
                        break;
                    }
                }
            }
        }
    }

    Tree[] kids = t.children();
    if (kids != null) {
        // Below an insertion point nothing more is inserted.
        Tree passedDown = bInsertNow ? null : treeToInsert;
        for (Tree kid : kids) {
            sb.append(' ');
            toStringBuilderExtenderByAnotherLinkedTree(sb, kid, passedDown);
        }
        if (bInsertNow) {
            sb.append(' ');
            toStringBuilderExtenderByAnotherLinkedTree(sb, treeToInsert.getChild(0).getChild(1), null);
        }
    }
    return sb.append(')');
}
public StringBuilder toStringBuilder(StringBuilder sb, Tree t) { | |
if (t.isLeaf()) { | |
if (t.label() != null) { | |
sb.append(t.label().value()); | |
} | |
return sb; | |
} else { | |
sb.append('('); | |
if (t.label() != null) { | |
if (t.value() != null) { | |
sb.append(t.label().value()); | |
} | |
} | |
Tree[] kids = t.children(); | |
if (kids != null) { | |
for (Tree kid : kids) { | |
sb.append(' '); | |
toStringBuilder(sb, kid); | |
} | |
} | |
return sb.append(')'); | |
} | |
} | |
public static void main(String[] args){ | |
VerbNetProcessor p = VerbNetProcessor. | |
getInstance("/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources"); | |
Matcher matcher = new Matcher(); | |
TreeExtenderByAnotherLinkedTree extender = new TreeExtenderByAnotherLinkedTree(); | |
ParseThicket pt = matcher.buildParseThicketFromTextWithRST(//"I went to the forest to look for a tree. I found out that it was thick and green"); | |
"Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons. "+ | |
"UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " + | |
"A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " + | |
"Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. "); | |
List<String> results = extender.buildForestForCorefArcs(pt); | |
System.out.println(results); | |
//System.exit(0); | |
List<Tree> forest = pt.getSentences(); | |
List<Tree> trees = extender.getASubtreeWithRootAsNodeForWord1(forest.get(1), forest.get(1), new String[]{"its"}); | |
System.out.println(trees); | |
StringBuilder sb = new StringBuilder(10000); | |
extender.toStringBuilderExtenderByAnotherLinkedTree1(sb, forest.get(0), trees.get(0), new String[]{"the", "forest"}); | |
System.out.println(sb.toString()); | |
// | |
//extender.toStringBuilderExtenderByAnotherLinkedTree(sb, forest.get(0), forest.get(1)); | |
//System.out.println(sb.toString()); | |
} | |
} |