opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefBuilderWithNER.java - opennlp-sandbox - Git at Google

 package opennlp.tools.parse_thicket;

 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;

 import edu.stanford.nlp.dcoref.CorefChain;
 import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
 import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
 import edu.stanford.nlp.ie.AbstractSequenceClassifier;
 import edu.stanford.nlp.ie.crf.CRFClassifier;
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
 import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
 import edu.stanford.nlp.pipeline.Annotation;
 import edu.stanford.nlp.sentiment.SentimentCoreAnnotations.SentimentAnnotatedTree;
 import edu.stanford.nlp.trees.Tree;
 import edu.stanford.nlp.trees.TreeCoreAnnotations;
 import edu.stanford.nlp.util.CoreMap;
 import opennlp.tools.parse_thicket.ArcType;
 import opennlp.tools.parse_thicket.Pair;
 import opennlp.tools.parse_thicket.ParseCorefsBuilder;
 import opennlp.tools.parse_thicket.ParseThicket;
 import opennlp.tools.parse_thicket.ParseTreeNode;
 import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;

 /**
  * Builds a {@link ParseThicket} for a text, attaching to each token node the label
  * assigned by a separately loaded CRF sequence classifier, in addition to the
  * annotations produced by the pipeline inherited from {@link ParseCorefsBuilder}.
  */
 public class ParseCorefBuilderWithNER extends ParseCorefsBuilder {

 	/** Lazily created shared instance handed out by {@link #getInstance()}. */
 	private static ParseCorefBuilderWithNER instanceNER;

 	/**
 	 * Returns the shared builder, creating it on first use. The method is
 	 * synchronized so that concurrent first calls create only one instance.
 	 *
 	 * @return the shared builder instance
 	 */
 	public synchronized static ParseCorefBuilderWithNER  getInstance() {
 		if (instanceNER == null) {
 			instanceNER = new ParseCorefBuilderWithNER ();
 		}

 		return instanceNER;
 	}


 	/** CRF sequence classifier used to label tokens; assigned in the constructor. */
 	AbstractSequenceClassifier<CoreLabel> classifier = null;

 	/** Runs the superclass set-up and loads the default CRF classifier. */
 	ParseCorefBuilderWithNER() {
 		super();
 		classifier = CRFClassifier.getDefaultClassifier();
 	}

 	/**
 	 * Annotates {@code text} with the inherited pipeline, runs the CRF classifier over
 	 * the same text, and assembles the result into a {@link ParseThicket}.
 	 *
 	 * <p>Pipeline tokens and classifier tokens are paired by position (sentence index,
 	 * then token index). Pairing stops as soon as the classifier output has no
 	 * counterpart for the current sentence or token. A classifier label other than
 	 * {@code "O"} is stored on the node under the attribute key {@code "ner"}.
 	 *
 	 * <p>Any exception raised while annotating or walking the sentences is printed and
 	 * the method continues with whatever was collected up to that point. Coreference
 	 * arcs are skipped when the annotation carries no coreference chains.
 	 *
 	 * @param text the text to analyse
 	 * @return the thicket holding parse trees, arcs, sentiment profile and token nodes
 	 */
 	public ParseThicket buildParseThicket(String text){
 		List<Tree> ptTrees = new ArrayList<Tree>();
 		// all numbering from 1, not 0
 		List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
 		List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();
 		List<Float> sentimentProfile = new ArrayList<Float>();

 		annotation = new Annotation(text);
 		try {
 			pipeline.annotate(annotation);
 			List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
 			List<List<CoreLabel>> nerClassesText = classifier.classify(text);

 			int nSent = 0;
 			if (sentences != null && sentences.size() > 0) {
 				for (CoreMap sentence : sentences) {
 					List<ParseTreeNode> nodes = new ArrayList<ParseTreeNode>();
 					// the classifier produced fewer sentences than the pipeline: stop pairing
 					if (nSent >= nerClassesText.size()) {
 						break;
 					}
 					List<CoreLabel> nerClassesSent = nerClassesText.get(nSent);

 					// traversing the words in the current sentence
 					// a CoreLabel is a CoreMap with additional token-specific methods
 					List<CoreLabel> coreLabelList = sentence.get(TokensAnnotation.class);
 					int count = 1;
 					for (CoreLabel token : coreLabelList) {
 						// the classifier produced fewer tokens for this sentence: stop pairing
 						if (count - 1 >= nerClassesSent.size()) {
 							break;
 						}
 						CoreLabel classNerWord = nerClassesSent.get(count - 1);
 						// this is the text of the token
 						String lemma = token.get(TextAnnotation.class);
 						// this is the POS tag of the token
 						String pos = token.get(PartOfSpeechAnnotation.class);
 						// this is the NER label of the token
 						String ne = token.get(NamedEntityTagAnnotation.class);

 						ParseTreeNode p = new ParseTreeNode(lemma, pos, ne, count);
 						String ner = classNerWord.get(CoreAnnotations.AnswerAnnotation.class);
 						// a missing answer is treated like "O" instead of aborting the whole text
 						if (ner != null && !"O".equals(ner)) {
 							Map<String, Object> nerMap = new HashMap<String, Object>();
 							nerMap.put("ner", ner);
 							p.setAttributes(nerMap);
 						}
 						nodes.add(p);
 						count++;
 					}
 					nSent++;
 					nodesThicket.add(nodes);
 					Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
 					// now sentiment for given sentence
 					Tree sentimentTree = sentence.get(SentimentAnnotatedTree.class);
 					float sentiment = RNNCoreAnnotations.getPredictedClass(sentimentTree);
 					sentimentProfile.add(sentiment);
 					ptTrees.add(tree);
 				}
 			}
 		} catch (Exception e) {
 			// best effort: report the failure and keep what has been collected so far
 			e.printStackTrace();
 		}


 		// now coreferences; the annotation may hold no chains (e.g. annotation failed above)
 		Map<Integer, CorefChain> corefs = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
 		if (corefs != null) {
 			for (CorefChain c : corefs.values()) {
 				List<CorefMention> mentions = c.getMentionsInTextualOrder();
 				// one arc per unordered pair of mentions within the chain
 				for (int i = 0; i < mentions.size(); i++) {
 					for (int j = i + 1; j < mentions.size(); j++) {
 						CorefMention mi = mentions.get(i), mj = mentions.get(j);

 						int niSentence = mi.position.get(0);
 						int niWord = mi.startIndex;
 						int njSentence = mj.position.get(0);
 						int njWord = mj.startIndex;

 						ArcType arcType = new ArcType("coref-", mj.mentionType+"-"+mj.animacy, 0, 0);

 						WordWordInterSentenceRelationArc arc =
 								new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(niSentence,niWord),
 										new Pair<Integer, Integer>(njSentence,njWord), mi.mentionSpan, mj.mentionSpan,
 										arcType);
 						arcs.add(arc);
 					}
 				}
 			}
 		}
 		// arcs computed by the inherited buildCAarcs from the collected token nodes
 		List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket);
 		arcs.addAll(arcsCA);

 		ParseThicket result = new ParseThicket(ptTrees, arcs);
 		result.setSentimentProfile(sentimentProfile);
 		result.setNodesThicket(nodesThicket);
 		return result;
 	}

 	/** Manual smoke run: builds a thicket for a fixed three-sentence sample text. */
 	public static void main(String[] args){
 		new ParseCorefBuilderWithNER ().buildParseThicket("No one knows yet what General Prayuth's real intentions are. He has good reason to worry about resistance. "
 				+ "The pro-government Red-Shirt movement is far better organised than eight years ago, and could still be financed by former Prime Minister Thaksin Shinawatra's deep pockets.");
 	}

 }
	package opennlp.tools.parse_thicket;

	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.List;
	import java.util.Map;

	import edu.stanford.nlp.dcoref.CorefChain;
	import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
	import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
	import edu.stanford.nlp.ie.AbstractSequenceClassifier;
	import edu.stanford.nlp.ie.crf.CRFClassifier;
	import edu.stanford.nlp.ling.CoreAnnotations;
	import edu.stanford.nlp.ling.CoreLabel;
	import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
	import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
	import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
	import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
	import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
	import edu.stanford.nlp.pipeline.Annotation;
	import edu.stanford.nlp.sentiment.SentimentCoreAnnotations.SentimentAnnotatedTree;
	import edu.stanford.nlp.trees.Tree;
	import edu.stanford.nlp.trees.TreeCoreAnnotations;
	import edu.stanford.nlp.util.CoreMap;
	import opennlp.tools.parse_thicket.ArcType;
	import opennlp.tools.parse_thicket.Pair;
	import opennlp.tools.parse_thicket.ParseCorefsBuilder;
	import opennlp.tools.parse_thicket.ParseThicket;
	import opennlp.tools.parse_thicket.ParseTreeNode;
	import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;

	/**
	 * Builds a {@link ParseThicket} for a text, attaching to each token node the label
	 * assigned by a separately loaded CRF sequence classifier, in addition to the
	 * annotations produced by the pipeline inherited from {@link ParseCorefsBuilder}.
	 */
	public class ParseCorefBuilderWithNER extends ParseCorefsBuilder {

		/** Lazily created shared instance handed out by {@link #getInstance()}. */
		private static ParseCorefBuilderWithNER instanceNER;

		/**
		 * Returns the shared builder, creating it on first use. The method is
		 * synchronized so that concurrent first calls create only one instance.
		 *
		 * @return the shared builder instance
		 */
		public synchronized static ParseCorefBuilderWithNER getInstance() {
			if (instanceNER == null) {
				instanceNER = new ParseCorefBuilderWithNER ();
			}

			return instanceNER;
		}


		/** CRF sequence classifier used to label tokens; assigned in the constructor. */
		AbstractSequenceClassifier<CoreLabel> classifier = null;

		/** Runs the superclass set-up and loads the default CRF classifier. */
		ParseCorefBuilderWithNER() {
			super();
			classifier = CRFClassifier.getDefaultClassifier();
		}

		/**
		 * Annotates {@code text} with the inherited pipeline, runs the CRF classifier over
		 * the same text, and assembles the result into a {@link ParseThicket}.
		 *
		 * <p>Pipeline tokens and classifier tokens are paired by position (sentence index,
		 * then token index). Pairing stops as soon as the classifier output has no
		 * counterpart for the current sentence or token. A classifier label other than
		 * {@code "O"} is stored on the node under the attribute key {@code "ner"}.
		 *
		 * <p>Any exception raised while annotating or walking the sentences is printed and
		 * the method continues with whatever was collected up to that point. Coreference
		 * arcs are skipped when the annotation carries no coreference chains.
		 *
		 * @param text the text to analyse
		 * @return the thicket holding parse trees, arcs, sentiment profile and token nodes
		 */
		public ParseThicket buildParseThicket(String text){
			List<Tree> ptTrees = new ArrayList<Tree>();
			// all numbering from 1, not 0
			List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
			List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();
			List<Float> sentimentProfile = new ArrayList<Float>();

			annotation = new Annotation(text);
			try {
				pipeline.annotate(annotation);
				List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
				List<List<CoreLabel>> nerClassesText = classifier.classify(text);

				int nSent = 0;
				if (sentences != null && sentences.size() > 0) {
					for (CoreMap sentence : sentences) {
						List<ParseTreeNode> nodes = new ArrayList<ParseTreeNode>();
						// the classifier produced fewer sentences than the pipeline: stop pairing
						if (nSent >= nerClassesText.size()) {
							break;
						}
						List<CoreLabel> nerClassesSent = nerClassesText.get(nSent);

						// traversing the words in the current sentence
						// a CoreLabel is a CoreMap with additional token-specific methods
						List<CoreLabel> coreLabelList = sentence.get(TokensAnnotation.class);
						int count = 1;
						for (CoreLabel token : coreLabelList) {
							// the classifier produced fewer tokens for this sentence: stop pairing
							if (count - 1 >= nerClassesSent.size()) {
								break;
							}
							CoreLabel classNerWord = nerClassesSent.get(count - 1);
							// this is the text of the token
							String lemma = token.get(TextAnnotation.class);
							// this is the POS tag of the token
							String pos = token.get(PartOfSpeechAnnotation.class);
							// this is the NER label of the token
							String ne = token.get(NamedEntityTagAnnotation.class);

							ParseTreeNode p = new ParseTreeNode(lemma, pos, ne, count);
							String ner = classNerWord.get(CoreAnnotations.AnswerAnnotation.class);
							// a missing answer is treated like "O" instead of aborting the whole text
							if (ner != null && !"O".equals(ner)) {
								Map<String, Object> nerMap = new HashMap<String, Object>();
								nerMap.put("ner", ner);
								p.setAttributes(nerMap);
							}
							nodes.add(p);
							count++;
						}
						nSent++;
						nodesThicket.add(nodes);
						Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
						// now sentiment for given sentence
						Tree sentimentTree = sentence.get(SentimentAnnotatedTree.class);
						float sentiment = RNNCoreAnnotations.getPredictedClass(sentimentTree);
						sentimentProfile.add(sentiment);
						ptTrees.add(tree);
					}
				}
			} catch (Exception e) {
				// best effort: report the failure and keep what has been collected so far
				e.printStackTrace();
			}


			// now coreferences; the annotation may hold no chains (e.g. annotation failed above)
			Map<Integer, CorefChain> corefs = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
			if (corefs != null) {
				for (CorefChain c : corefs.values()) {
					List<CorefMention> mentions = c.getMentionsInTextualOrder();
					// one arc per unordered pair of mentions within the chain
					for (int i = 0; i < mentions.size(); i++) {
						for (int j = i + 1; j < mentions.size(); j++) {
							CorefMention mi = mentions.get(i), mj = mentions.get(j);

							int niSentence = mi.position.get(0);
							int niWord = mi.startIndex;
							int njSentence = mj.position.get(0);
							int njWord = mj.startIndex;

							ArcType arcType = new ArcType("coref-", mj.mentionType+"-"+mj.animacy, 0, 0);

							WordWordInterSentenceRelationArc arc =
									new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(niSentence,niWord),
											new Pair<Integer, Integer>(njSentence,njWord), mi.mentionSpan, mj.mentionSpan,
											arcType);
							arcs.add(arc);
						}
					}
				}
			}
			// arcs computed by the inherited buildCAarcs from the collected token nodes
			List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket);
			arcs.addAll(arcsCA);

			ParseThicket result = new ParseThicket(ptTrees, arcs);
			result.setSentimentProfile(sentimentProfile);
			result.setNodesThicket(nodesThicket);
			return result;
		}

		/** Manual smoke run: builds a thicket for a fixed three-sentence sample text. */
		public static void main(String[] args){
			new ParseCorefBuilderWithNER ().buildParseThicket("No one knows yet what General Prayuth's real intentions are. He has good reason to worry about resistance. "
			+ "The pro-government Red-Shirt movement is far better organised than eight years ago, and could still be financed by former Prime Minister Thaksin Shinawatra's deep pockets.");
		}

	}