opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java - opennlp-sandbox - Git at Google

 package opennlp.tools.parse_thicket;

 import java.io.*;
 import java.util.*;

 import opennlp.tools.parse_thicket.communicative_actions.CommunicativeActionsArcBuilder;

 import edu.stanford.nlp.dcoref.CorefChain;
 import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
 import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
 import edu.stanford.nlp.ling.*;
 import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
 import edu.stanford.nlp.pipeline.*;
 import edu.stanford.nlp.trees.*;
 import edu.stanford.nlp.util.*;

 public class ParseCorefsBuilder {
 	protected static ParseCorefsBuilder instance;
 	protected Annotation annotation;
 	protected StanfordCoreNLP pipeline;
 	CommunicativeActionsArcBuilder caFinder = new CommunicativeActionsArcBuilder();

 	  /**
 	   * singleton method of instantiating the processor
 	   *
 	   * @return the instance
 	   */
 	  public synchronized static ParseCorefsBuilder getInstance() {
 	    if (instance == null)
 	      instance = new ParseCorefsBuilder();

 	    return instance;
 	  }

 	protected ParseCorefsBuilder(){
 		Properties props = new Properties();
 		props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment");
 		pipeline = new StanfordCoreNLP(props);
 	}

 	public ParseThicket buildParseThicket(String text){
 		List<Tree> ptTrees = new ArrayList<Tree>();
 		// all numbering from 1, not 0
 		List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
 		List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();

 		annotation = new Annotation(text);
 		try {
 			pipeline.annotate(annotation);
 			List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
 			if (sentences != null && sentences.size() > 0)
 			for(CoreMap sentence: sentences){
 				List<ParseTreeNode> nodes = new ArrayList<ParseTreeNode>();

 				// traversing the words in the current sentence
 			    // a CoreLabel is a CoreMap with additional token-specific methods
 				Class<TokensAnnotation> tokenAnn = TokensAnnotation.class;
 				List<CoreLabel> coreLabelList = sentence.get(tokenAnn);
 				int count=1;
 			    for (CoreLabel token: coreLabelList ) {
 			      // this is the text of the token
 			      String lemma = token.get(TextAnnotation.class);
 			      // this is the POS tag of the token
 			      String pos = token.get(PartOfSpeechAnnotation.class);
 			      // this is the NER label of the token
 			      String ne = token.get(NamedEntityTagAnnotation.class);
 			      nodes.add(new ParseTreeNode(lemma, pos, ne, count));
 			      count++;
 			    }
 			    nodesThicket.add(nodes);
 			  Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
 			  ptTrees.add(tree);
 			}
 		} catch (Exception e) {
 			e.printStackTrace();
 		}


 	    // now coreferences
 	    Map<Integer, CorefChain> corefs = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
 	    List<CorefChain> chains = new ArrayList<CorefChain>(corefs.values());
 	    for(CorefChain c: chains){
 	      //System.out.println(c);
 	      List<CorefMention> mentions = c.getMentionsInTextualOrder();
 	      //System.out.println(mentions);
 	      if (mentions.size()>1)
 	      for(int i=0; i<mentions.size(); i++){
 	    	  for(int j=i+1; j<mentions.size(); j++){
 	    	  CorefMention mi = mentions.get(i), mj=mentions.get(j);


 	    	  int niSentence = mi.position.get(0);
 	    	  int niWord = mi.startIndex;
 	    	  int njSentence = mj.position.get(0);
 	    	  int njWord = mj.startIndex;

 	    	  ArcType arcType = new ArcType("coref-", mj.mentionType+"-"+mj.animacy, 0, 0);

 	    	  WordWordInterSentenceRelationArc arc =
 	    			  new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(niSentence,niWord),
 	    					  new Pair<Integer, Integer>(njSentence,njWord), mi.mentionSpan, mj.mentionSpan,
 	    					  arcType);
 	    	  arcs.add(arc);
 	    	  }
 	      }
 	    }
 	    List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket);
 	    arcs.addAll(arcsCA);

 	    ParseThicket result = new ParseThicket(ptTrees, arcs);
 	    result.setNodesThicket(nodesThicket);
 	    return result;
 	}

   public List<WordWordInterSentenceRelationArc> buildCAarcs(
 			List<List<ParseTreeNode>> nodesThicket) {
 	  List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();

 		for(int sentI=0; sentI<nodesThicket.size(); sentI++){
 			for(int sentJ=sentI+1; sentJ<nodesThicket.size(); sentJ++){
 				List<ParseTreeNode> sentenceI = nodesThicket.get(sentI),
 						sentenceJ = nodesThicket.get(sentJ);
 				Pair<String, Integer[]> caI = caFinder.findCAInSentence(sentenceI);
 				Pair<String, Integer[]> caJ = caFinder.findCAInSentence(sentenceJ);
 				int indexCA1 = caFinder.findCAIndexInSentence(sentenceI);
 				int indexCA2 = caFinder.findCAIndexInSentence(sentenceJ);
 				if (caI==null || caJ==null)
 					continue;
 				Pair<String, Integer[]> caGen = caFinder.generalize(caI, caJ).get(0);

 				ArcType arcType = new ArcType("ca",
 						caGen.getFirst().toString()+printNumArray(caGen.getSecond()), 0, 0);
 				 WordWordInterSentenceRelationArc arc =
 		    			  new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(sentI,indexCA1),
 		    					  new Pair<Integer, Integer>(sentJ,indexCA2), caI.getFirst(), caJ.getFirst(),
 		    					  arcType);
 		    	  arcs.add(arc);

 			}
 					}

 		return arcs;
 	}

     private String printNumArray(Integer[] arr){
     	StringBuffer buf = new StringBuffer();
     	for(Integer i: arr){
     		buf.append(Integer.toString(i)+ " ");
     	}
     	return buf.toString();
     }

 public static void main(String[] args) throws IOException {
 	  ParseCorefsBuilder builder = ParseCorefsBuilder.getInstance();
 	  ParseThicket  th = builder.buildParseThicket("Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+
     		  "UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " +
     		  "A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " +
     		  "Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. ");
     //GraphFromPTreeBuilder gbuilder = new GraphFromPTreeBuilder();
     //gbuilder.buildGraphFromPT(th);

   }

 }

 /*
  * [<sent=1-word=1..Iran> ===> <sent=3-word=9..Iran>, <sent=1-word=1..Iran> ===> <sent=4-word=1..Iran>, <sent=1-word=1..Iran> ===> <sent=4-word=4..its>, <sent=1-word=1..Iran> ===> <sent=4-word=17..it>, <sent=3-word=9..Iran> ===> <sent=4-word=1..Iran>, <sent=3-word=9..Iran> ===> <sent=4-word=4..its>, <sent=3-word=9..Iran> ===> <sent=4-word=17..it>, <sent=4-word=1..Iran> ===> <sent=4-word=4..its>, <sent=4-word=1..Iran> ===> <sent=4-word=17..it>, <sent=4-word=4..its> ===> <sent=4-word=17..it>, <sent=1-word=6..UN> ===> <sent=2-word=1..UN>, <sent=1-word=5..the UN proposal> ===> <sent=1-word=10..its>, <sent=1-word=5..the UN proposal> ===> <sent=1-word=13..its>, <sent=1-word=10..its> ===> <sent=1-word=13..its>, <sent=1-word=16..nuclear weapons> ===> <sent=3-word=14..nuclear weapons>, <sent=2-word=1..UN nuclear watchdog> ===> <sent=2-word=11..its>]

 [[[1]Iran:NNP>LOCATION, [2]refuses:VBZ>O, [3]to:TO>O, [4]accept:VB>O, [5]the:DT>O, [6]UN:NNP>ORGANIZATION, [7]proposal:NN>O, [8]to:TO>O, [9]end:VB>O, [10]its:PRP$>O, [11]dispute:NN>O, [12]over:IN>O, [13]its:PRP$>O, [14]work:NN>O, [15]on:IN>O, [16]nuclear:JJ>O, [17]weapons:NNS>O, [18].:.>O],

 [[1]UN:NNP>ORGANIZATION, [2]nuclear:JJ>O, [3]watchdog:NN>O, [4]passes:VBZ>O, [5]a:DT>O, [6]resolution:NN>O, [7]condemning:VBG>O, [8]Iran:NNP>LOCATION, [9]for:IN>O, [10]developing:VBG>O, [11]its:PRP$>O, [12]second:JJ>ORDINAL, [13]uranium:NN>O, [14]enrichment:NN>O, [15]site:NN>O, [16]in:IN>O, [17]secret:NN>O, [18].:.>O],

 [[1]A:DT>O, [2]recent:JJ>O, [3]IAEA:NNP>ORGANIZATION, [4]report:NN>O, [5]presented:VBD>O, [6]diagrams:NNS>O, [7]that:WDT>O, [8]suggested:VBD>O, [9]Iran:NNP>LOCATION, [10]was:VBD>O, [11]secretly:RB>O, [12]working:VBG>O, [13]on:IN>O, [14]nuclear:JJ>O, [15]weapons:NNS>O, [16].:.>O],

 [[1]Iran:NNP>LOCATION, [2]envoy:NN>O, [3]says:VBZ>O, [4]its:PRP$>O, [5]nuclear:JJ>O, [6]development:NN>O, [7]is:VBZ>O, [8]for:IN>O, [9]peaceful:JJ>O, [10]purpose:NN>O, [11],:,>O, [12]and:CC>O, [13]the:DT>O, [14]material:NN>O, [15]evidence:NN>O, [16]against:IN>O, [17]it:PRP>O, [18]has:VBZ>O, [19]been:VBN>O, [20]fabricated:VBN>O, [21]by:IN>O, [22]the:DT>O, [23]US:NNP>LOCATION, [24].:.>O]]
 */
	package opennlp.tools.parse_thicket;

	import java.io.*;
	import java.util.*;

	import opennlp.tools.parse_thicket.communicative_actions.CommunicativeActionsArcBuilder;

	import edu.stanford.nlp.dcoref.CorefChain;
	import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
	import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
	import edu.stanford.nlp.ling.*;
	import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
	import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
	import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
	import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
	import edu.stanford.nlp.pipeline.*;
	import edu.stanford.nlp.trees.*;
	import edu.stanford.nlp.util.*;

	public class ParseCorefsBuilder {
	protected static ParseCorefsBuilder instance;
	protected Annotation annotation;
	protected StanfordCoreNLP pipeline;
	CommunicativeActionsArcBuilder caFinder = new CommunicativeActionsArcBuilder();

	/**
	* singleton method of instantiating the processor
	*
	* @return the instance
	*/
	public synchronized static ParseCorefsBuilder getInstance() {
	if (instance == null)
	instance = new ParseCorefsBuilder();

	return instance;
	}

	protected ParseCorefsBuilder(){
	Properties props = new Properties();
	props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment");
	pipeline = new StanfordCoreNLP(props);
	}

	public ParseThicket buildParseThicket(String text){
	List<Tree> ptTrees = new ArrayList<Tree>();
	// all numbering from 1, not 0
	List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
	List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();

	annotation = new Annotation(text);
	try {
	pipeline.annotate(annotation);
	List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
	if (sentences != null && sentences.size() > 0)
	for(CoreMap sentence: sentences){
	List<ParseTreeNode> nodes = new ArrayList<ParseTreeNode>();

	// traversing the words in the current sentence
	// a CoreLabel is a CoreMap with additional token-specific methods
	Class<TokensAnnotation> tokenAnn = TokensAnnotation.class;
	List<CoreLabel> coreLabelList = sentence.get(tokenAnn);
	int count=1;
	for (CoreLabel token: coreLabelList ) {
	// this is the text of the token
	String lemma = token.get(TextAnnotation.class);
	// this is the POS tag of the token
	String pos = token.get(PartOfSpeechAnnotation.class);
	// this is the NER label of the token
	String ne = token.get(NamedEntityTagAnnotation.class);
	nodes.add(new ParseTreeNode(lemma, pos, ne, count));
	count++;
	}
	nodesThicket.add(nodes);
	Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
	ptTrees.add(tree);
	}
	} catch (Exception e) {
	e.printStackTrace();
	}


	// now coreferences
	Map<Integer, CorefChain> corefs = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
	List<CorefChain> chains = new ArrayList<CorefChain>(corefs.values());
	for(CorefChain c: chains){
	//System.out.println(c);
	List<CorefMention> mentions = c.getMentionsInTextualOrder();
	//System.out.println(mentions);
	if (mentions.size()>1)
	for(int i=0; i<mentions.size(); i++){
	for(int j=i+1; j<mentions.size(); j++){
	CorefMention mi = mentions.get(i), mj=mentions.get(j);


	int niSentence = mi.position.get(0);
	int niWord = mi.startIndex;
	int njSentence = mj.position.get(0);
	int njWord = mj.startIndex;

	ArcType arcType = new ArcType("coref-", mj.mentionType+"-"+mj.animacy, 0, 0);

	WordWordInterSentenceRelationArc arc =
	new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(niSentence,niWord),
	new Pair<Integer, Integer>(njSentence,njWord), mi.mentionSpan, mj.mentionSpan,
	arcType);
	arcs.add(arc);
	}
	}
	}
	List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket);
	arcs.addAll(arcsCA);

	ParseThicket result = new ParseThicket(ptTrees, arcs);
	result.setNodesThicket(nodesThicket);
	return result;
	}

	public List<WordWordInterSentenceRelationArc> buildCAarcs(
	List<List<ParseTreeNode>> nodesThicket) {
	List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();

	for(int sentI=0; sentI<nodesThicket.size(); sentI++){
	for(int sentJ=sentI+1; sentJ<nodesThicket.size(); sentJ++){
	List<ParseTreeNode> sentenceI = nodesThicket.get(sentI),
	sentenceJ = nodesThicket.get(sentJ);
	Pair<String, Integer[]> caI = caFinder.findCAInSentence(sentenceI);
	Pair<String, Integer[]> caJ = caFinder.findCAInSentence(sentenceJ);
	int indexCA1 = caFinder.findCAIndexInSentence(sentenceI);
	int indexCA2 = caFinder.findCAIndexInSentence(sentenceJ);
	if (caI==null \|\| caJ==null)
	continue;
	Pair<String, Integer[]> caGen = caFinder.generalize(caI, caJ).get(0);

	ArcType arcType = new ArcType("ca",
	caGen.getFirst().toString()+printNumArray(caGen.getSecond()), 0, 0);
	WordWordInterSentenceRelationArc arc =
	new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(sentI,indexCA1),
	new Pair<Integer, Integer>(sentJ,indexCA2), caI.getFirst(), caJ.getFirst(),
	arcType);
	arcs.add(arc);

	}
	}

	return arcs;
	}

	private String printNumArray(Integer[] arr){
	StringBuffer buf = new StringBuffer();
	for(Integer i: arr){
	buf.append(Integer.toString(i)+ " ");
	}
	return buf.toString();
	}

	public static void main(String[] args) throws IOException {
	ParseCorefsBuilder builder = ParseCorefsBuilder.getInstance();
	ParseThicket th = builder.buildParseThicket("Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+
	"UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " +
	"A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " +
	"Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. ");
	//GraphFromPTreeBuilder gbuilder = new GraphFromPTreeBuilder();
	//gbuilder.buildGraphFromPT(th);

	}

	}

	/*
	* [<sent=1-word=1..Iran> ===> <sent=3-word=9..Iran>, <sent=1-word=1..Iran> ===> <sent=4-word=1..Iran>, <sent=1-word=1..Iran> ===> <sent=4-word=4..its>, <sent=1-word=1..Iran> ===> <sent=4-word=17..it>, <sent=3-word=9..Iran> ===> <sent=4-word=1..Iran>, <sent=3-word=9..Iran> ===> <sent=4-word=4..its>, <sent=3-word=9..Iran> ===> <sent=4-word=17..it>, <sent=4-word=1..Iran> ===> <sent=4-word=4..its>, <sent=4-word=1..Iran> ===> <sent=4-word=17..it>, <sent=4-word=4..its> ===> <sent=4-word=17..it>, <sent=1-word=6..UN> ===> <sent=2-word=1..UN>, <sent=1-word=5..the UN proposal> ===> <sent=1-word=10..its>, <sent=1-word=5..the UN proposal> ===> <sent=1-word=13..its>, <sent=1-word=10..its> ===> <sent=1-word=13..its>, <sent=1-word=16..nuclear weapons> ===> <sent=3-word=14..nuclear weapons>, <sent=2-word=1..UN nuclear watchdog> ===> <sent=2-word=11..its>]

	[[[1]Iran:NNP>LOCATION, [2]refuses:VBZ>O, [3]to:TO>O, [4]accept:VB>O, [5]the:DT>O, [6]UN:NNP>ORGANIZATION, [7]proposal:NN>O, [8]to:TO>O, [9]end:VB>O, [10]its:PRP$>O, [11]dispute:NN>O, [12]over:IN>O, [13]its:PRP$>O, [14]work:NN>O, [15]on:IN>O, [16]nuclear:JJ>O, [17]weapons:NNS>O, [18].:.>O],

	[[1]UN:NNP>ORGANIZATION, [2]nuclear:JJ>O, [3]watchdog:NN>O, [4]passes:VBZ>O, [5]a:DT>O, [6]resolution:NN>O, [7]condemning:VBG>O, [8]Iran:NNP>LOCATION, [9]for:IN>O, [10]developing:VBG>O, [11]its:PRP$>O, [12]second:JJ>ORDINAL, [13]uranium:NN>O, [14]enrichment:NN>O, [15]site:NN>O, [16]in:IN>O, [17]secret:NN>O, [18].:.>O],

	[[1]A:DT>O, [2]recent:JJ>O, [3]IAEA:NNP>ORGANIZATION, [4]report:NN>O, [5]presented:VBD>O, [6]diagrams:NNS>O, [7]that:WDT>O, [8]suggested:VBD>O, [9]Iran:NNP>LOCATION, [10]was:VBD>O, [11]secretly:RB>O, [12]working:VBG>O, [13]on:IN>O, [14]nuclear:JJ>O, [15]weapons:NNS>O, [16].:.>O],

	[[1]Iran:NNP>LOCATION, [2]envoy:NN>O, [3]says:VBZ>O, [4]its:PRP$>O, [5]nuclear:JJ>O, [6]development:NN>O, [7]is:VBZ>O, [8]for:IN>O, [9]peaceful:JJ>O, [10]purpose:NN>O, [11],:,>O, [12]and:CC>O, [13]the:DT>O, [14]material:NN>O, [15]evidence:NN>O, [16]against:IN>O, [17]it:PRP>O, [18]has:VBZ>O, [19]been:VBN>O, [20]fabricated:VBN>O, [21]by:IN>O, [22]the:DT>O, [23]US:NNP>LOCATION, [24].:.>O]]
	*/