// blob: 8f215f73daf9853d89623ed517e3bdf9ded2642a [file] [log] [blame]
package opennlp.tools.parse_thicket;
import java.io.*;
import java.util.*;
import opennlp.tools.parse_thicket.communicative_actions.CommunicativeActionsArcBuilder;
import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.*;
/**
 * Builds a {@link ParseThicket} for a piece of text: one parse tree and one list of
 * {@link ParseTreeNode}s per sentence, plus inter-sentence arcs derived from coreference
 * chains and from communicative actions found in the sentences.
 *
 * <p>The Stanford CoreNLP pipeline is created once, in the constructor. Use
 * {@link #getInstance()} to obtain the shared builder.
 */
public class ParseCorefsBuilder {
    protected static ParseCorefsBuilder instance;
    /** Annotation of the text most recently passed to {@link #buildParseThicket(String)}. */
    protected Annotation annotation;
    protected StanfordCoreNLP pipeline;
    CommunicativeActionsArcBuilder caFinder = new CommunicativeActionsArcBuilder();

    /**
     * Singleton accessor: creates the builder on first use and returns the same instance
     * afterwards.
     *
     * @return the shared instance
     */
    public synchronized static ParseCorefsBuilder getInstance() {
        if (instance == null) {
            instance = new ParseCorefsBuilder();
        }
        return instance;
    }

    /** Configures and creates the CoreNLP pipeline used by {@link #buildParseThicket(String)}. */
    protected ParseCorefsBuilder() {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment");
        pipeline = new StanfordCoreNLP(props);
    }

    /**
     * Annotates {@code text} and assembles the resulting parse thicket.
     *
     * <p>Annotation is best effort: if the pipeline or the sentence traversal throws, the
     * stack trace is printed and the thicket is built from whatever was collected up to that
     * point. In that case no coreference arcs may be available and none are added.
     *
     * @param text the text to process
     * @return a thicket holding the sentence trees, the coreference and communicative-action
     *         arcs, and the per-sentence token nodes
     */
    public ParseThicket buildParseThicket(String text) {
        List<Tree> ptTrees = new ArrayList<Tree>();
        List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();
        List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();

        // Work on a local reference; the field is still assigned so that subclasses reading
        // 'annotation' keep seeing the most recent document.
        Annotation doc = new Annotation(text);
        annotation = doc;
        try {
            pipeline.annotate(doc);
            collectSentences(doc, ptTrees, nodesThicket);
        } catch (Exception e) {
            e.printStackTrace();
        }

        arcs.addAll(buildCorefArcs(doc));
        arcs.addAll(buildCAarcs(nodesThicket));

        ParseThicket result = new ParseThicket(ptTrees, arcs);
        result.setNodesThicket(nodesThicket);
        return result;
    }

    /** Appends one parse tree and one token-node list per annotated sentence of {@code doc}. */
    private void collectSentences(Annotation doc, List<Tree> ptTrees,
            List<List<ParseTreeNode>> nodesThicket) {
        List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
        if (sentences == null) {
            return;
        }
        for (CoreMap sentence : sentences) {
            List<ParseTreeNode> nodes = new ArrayList<ParseTreeNode>();
            // token positions are numbered from 1, not 0
            int position = 1;
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                String word = token.get(TextAnnotation.class);
                String pos = token.get(PartOfSpeechAnnotation.class);
                String ne = token.get(NamedEntityTagAnnotation.class);
                nodes.add(new ParseTreeNode(word, pos, ne, position));
                position++;
            }
            nodesThicket.add(nodes);
            ptTrees.add(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
        }
    }

    /**
     * Creates one arc for every ordered pair of mentions (earlier to later, in textual order)
     * within each coreference chain of {@code doc}.
     */
    private List<WordWordInterSentenceRelationArc> buildCorefArcs(Annotation doc) {
        List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
        Map<Integer, CorefChain> corefs = doc.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        if (corefs == null) {
            // annotation failed or produced no coreference information
            return arcs;
        }
        for (CorefChain chain : corefs.values()) {
            List<CorefMention> mentions = chain.getMentionsInTextualOrder();
            for (int i = 0; i < mentions.size(); i++) {
                CorefMention mi = mentions.get(i);
                for (int j = i + 1; j < mentions.size(); j++) {
                    CorefMention mj = mentions.get(j);
                    // the arc is labelled with the type and animacy of the later mention
                    ArcType arcType = new ArcType("coref-", mj.mentionType + "-" + mj.animacy, 0, 0);
                    arcs.add(new WordWordInterSentenceRelationArc(
                            new Pair<Integer, Integer>(mi.position.get(0), mi.startIndex),
                            new Pair<Integer, Integer>(mj.position.get(0), mj.startIndex),
                            mi.mentionSpan, mj.mentionSpan, arcType));
                }
            }
        }
        return arcs;
    }

    /**
     * Builds communicative-action arcs between every pair of sentences in which the finder
     * detects a communicative action.
     *
     * <p>Sentence pairs are skipped when either sentence has no communicative action, or
     * when generalizing the two actions yields nothing.
     *
     * @param nodesThicket token nodes, one list per sentence
     * @return the arcs found; empty if there are none
     */
    public List<WordWordInterSentenceRelationArc> buildCAarcs(
            List<List<ParseTreeNode>> nodesThicket) {
        List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
        int sentenceCount = nodesThicket.size();

        // Look each sentence up once instead of once per sentence pair.
        // NOTE(review): assumes the finder's lookups are side-effect free - confirm.
        List<Pair<String, Integer[]>> actions = new ArrayList<Pair<String, Integer[]>>(sentenceCount);
        int[] actionIndexes = new int[sentenceCount];
        for (int s = 0; s < sentenceCount; s++) {
            List<ParseTreeNode> sentence = nodesThicket.get(s);
            Pair<String, Integer[]> action = caFinder.findCAInSentence(sentence);
            actions.add(action);
            if (action != null) {
                actionIndexes[s] = caFinder.findCAIndexInSentence(sentence);
            }
        }

        for (int sentI = 0; sentI < sentenceCount; sentI++) {
            Pair<String, Integer[]> caI = actions.get(sentI);
            if (caI == null) {
                continue;
            }
            for (int sentJ = sentI + 1; sentJ < sentenceCount; sentJ++) {
                Pair<String, Integer[]> caJ = actions.get(sentJ);
                if (caJ == null) {
                    continue;
                }
                List<Pair<String, Integer[]>> generalized = caFinder.generalize(caI, caJ);
                if (generalized == null || generalized.isEmpty()) {
                    continue;
                }
                Pair<String, Integer[]> caGen = generalized.get(0);
                ArcType arcType = new ArcType("ca",
                        caGen.getFirst() + printNumArray(caGen.getSecond()), 0, 0);
                arcs.add(new WordWordInterSentenceRelationArc(
                        new Pair<Integer, Integer>(sentI, actionIndexes[sentI]),
                        new Pair<Integer, Integer>(sentJ, actionIndexes[sentJ]),
                        caI.getFirst(), caJ.getFirst(), arcType));
            }
        }
        return arcs;
    }

    /**
     * Renders the numbers separated by spaces, with a trailing space after each one.
     * A {@code null} array renders as the empty string.
     */
    private String printNumArray(Integer[] arr) {
        if (arr == null) {
            return "";
        }
        StringBuilder buf = new StringBuilder();
        for (Integer i : arr) {
            buf.append(i).append(" ");
        }
        return buf.toString();
    }

    /** Runs the builder on a short sample text. */
    public static void main(String[] args) throws IOException {
        ParseCorefsBuilder builder = ParseCorefsBuilder.getInstance();
        ParseThicket th = builder.buildParseThicket("Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons. " +
                "UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " +
                "A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " +
                "Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. ");
        //GraphFromPTreeBuilder gbuilder = new GraphFromPTreeBuilder();
        //gbuilder.buildGraphFromPT(th);
    }
}
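/*
 * Example coreference arcs and per-sentence token nodes for the sample text in main()
 * (presumably captured from an earlier run; main() itself prints nothing):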
* [<sent=1-word=1..Iran> ===> <sent=3-word=9..Iran>, <sent=1-word=1..Iran> ===> <sent=4-word=1..Iran>, <sent=1-word=1..Iran> ===> <sent=4-word=4..its>, <sent=1-word=1..Iran> ===> <sent=4-word=17..it>, <sent=3-word=9..Iran> ===> <sent=4-word=1..Iran>, <sent=3-word=9..Iran> ===> <sent=4-word=4..its>, <sent=3-word=9..Iran> ===> <sent=4-word=17..it>, <sent=4-word=1..Iran> ===> <sent=4-word=4..its>, <sent=4-word=1..Iran> ===> <sent=4-word=17..it>, <sent=4-word=4..its> ===> <sent=4-word=17..it>, <sent=1-word=6..UN> ===> <sent=2-word=1..UN>, <sent=1-word=5..the UN proposal> ===> <sent=1-word=10..its>, <sent=1-word=5..the UN proposal> ===> <sent=1-word=13..its>, <sent=1-word=10..its> ===> <sent=1-word=13..its>, <sent=1-word=16..nuclear weapons> ===> <sent=3-word=14..nuclear weapons>, <sent=2-word=1..UN nuclear watchdog> ===> <sent=2-word=11..its>]
[[[1]Iran:NNP>LOCATION, [2]refuses:VBZ>O, [3]to:TO>O, [4]accept:VB>O, [5]the:DT>O, [6]UN:NNP>ORGANIZATION, [7]proposal:NN>O, [8]to:TO>O, [9]end:VB>O, [10]its:PRP$>O, [11]dispute:NN>O, [12]over:IN>O, [13]its:PRP$>O, [14]work:NN>O, [15]on:IN>O, [16]nuclear:JJ>O, [17]weapons:NNS>O, [18].:.>O],
[[1]UN:NNP>ORGANIZATION, [2]nuclear:JJ>O, [3]watchdog:NN>O, [4]passes:VBZ>O, [5]a:DT>O, [6]resolution:NN>O, [7]condemning:VBG>O, [8]Iran:NNP>LOCATION, [9]for:IN>O, [10]developing:VBG>O, [11]its:PRP$>O, [12]second:JJ>ORDINAL, [13]uranium:NN>O, [14]enrichment:NN>O, [15]site:NN>O, [16]in:IN>O, [17]secret:NN>O, [18].:.>O],
[[1]A:DT>O, [2]recent:JJ>O, [3]IAEA:NNP>ORGANIZATION, [4]report:NN>O, [5]presented:VBD>O, [6]diagrams:NNS>O, [7]that:WDT>O, [8]suggested:VBD>O, [9]Iran:NNP>LOCATION, [10]was:VBD>O, [11]secretly:RB>O, [12]working:VBG>O, [13]on:IN>O, [14]nuclear:JJ>O, [15]weapons:NNS>O, [16].:.>O],
[[1]Iran:NNP>LOCATION, [2]envoy:NN>O, [3]says:VBZ>O, [4]its:PRP$>O, [5]nuclear:JJ>O, [6]development:NN>O, [7]is:VBZ>O, [8]for:IN>O, [9]peaceful:JJ>O, [10]purpose:NN>O, [11],:,>O, [12]and:CC>O, [13]the:DT>O, [14]material:NN>O, [15]evidence:NN>O, [16]against:IN>O, [17]it:PRP>O, [18]has:VBZ>O, [19]been:VBN>O, [20]fabricated:VBN>O, [21]by:IN>O, [22]the:DT>O, [23]US:NNP>LOCATION, [24].:.>O]]
*/