blob: 3841592bcf7747ec4467d4fdb22b06fd8707149b [file] [log] [blame]
package opennlp.tools.parse_thicket;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations.SentimentAnnotatedTree;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import opennlp.tools.parse_thicket.ArcType;
import opennlp.tools.parse_thicket.Pair;
import opennlp.tools.parse_thicket.ParseCorefsBuilder;
import opennlp.tools.parse_thicket.ParseThicket;
import opennlp.tools.parse_thicket.ParseTreeNode;
import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
public class ParseCorefBuilderWithNER extends ParseCorefsBuilder {
private static ParseCorefBuilderWithNER instanceNER;
public synchronized static ParseCorefBuilderWithNER getInstance() {
if (instanceNER == null)
instanceNER = new ParseCorefBuilderWithNER ();
return instanceNER;
}
AbstractSequenceClassifier<CoreLabel> classifier = null;
ParseCorefBuilderWithNER() {
super();
classifier = CRFClassifier.getDefaultClassifier();
}
public ParseThicket buildParseThicket(String text){
List<Tree> ptTrees = new ArrayList<Tree>();
// all numbering from 1, not 0
List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();
List<Float> sentimentProfile = new ArrayList<Float>();
annotation = new Annotation(text);
try {
pipeline.annotate(annotation);
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
List<List<CoreLabel>> nerClassesText = classifier.classify(text);
int nSent = 0;
if (sentences != null && sentences.size() > 0)
for(CoreMap sentence: sentences){
List<ParseTreeNode> nodes = new ArrayList<ParseTreeNode>();
if (nSent>=nerClassesText.size())
break;
List<CoreLabel> nerClassesSent = nerClassesText .get(nSent);
// traversing the words in the current sentence
// a CoreLabel is a CoreMap with additional token-specific methods
Class<TokensAnnotation> tokenAnn = TokensAnnotation.class;
List<CoreLabel> coreLabelList = sentence.get(tokenAnn);
int count=1;
for (CoreLabel token: coreLabelList ) {
if (count-1>=nerClassesSent.size())
break;
CoreLabel classNerWord = nerClassesSent .get(count-1);
// this is the text of the token
String lemma = token.get(TextAnnotation.class);
// this is the POS tag of the token
String pos = token.get(PartOfSpeechAnnotation.class);
// this is the NER label of the token
String ne = token.get(NamedEntityTagAnnotation.class);
ParseTreeNode p = new ParseTreeNode(lemma, pos, ne, count);
String ner = classNerWord .get(CoreAnnotations.AnswerAnnotation.class);
if (!ner.equals("O")){
Map<String, Object> nerMap = new HashMap<String, Object>();
nerMap.put("ner", ner);
p.setAttributes(nerMap);
}
nodes.add(p);
count++;
}
nSent++;
nodesThicket.add(nodes);
Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
// now sentiment for given sentence
Tree sentimentTree = sentence.get(SentimentAnnotatedTree.class);
float sentiment = RNNCoreAnnotations.getPredictedClass(sentimentTree);
sentimentProfile.add(sentiment);
ptTrees.add(tree);
}
} catch (Exception e) {
e.printStackTrace();
}
// now coreferences
Map<Integer, CorefChain> corefs = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
List<CorefChain> chains = new ArrayList<CorefChain>(corefs.values());
for(CorefChain c: chains){
//System.out.println(c);
List<CorefMention> mentions = c.getMentionsInTextualOrder();
//System.out.println(mentions);
if (mentions.size()>1)
for(int i=0; i<mentions.size(); i++){
for(int j=i+1; j<mentions.size(); j++){
CorefMention mi = mentions.get(i), mj=mentions.get(j);
int niSentence = mi.position.get(0);
int niWord = mi.startIndex;
int njSentence = mj.position.get(0);
int njWord = mj.startIndex;
ArcType arcType = new ArcType("coref-", mj.mentionType+"-"+mj.animacy, 0, 0);
WordWordInterSentenceRelationArc arc =
new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(niSentence,niWord),
new Pair<Integer, Integer>(njSentence,njWord), mi.mentionSpan, mj.mentionSpan,
arcType);
arcs.add(arc);
}
}
}
List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket);
arcs.addAll(arcsCA);
ParseThicket result = new ParseThicket(ptTrees, arcs);
result.setSentimentProfile(sentimentProfile);
result.setNodesThicket(nodesThicket);
return result;
}
public static void main(String[] args){
new ParseCorefBuilderWithNER ().buildParseThicket("No one knows yet what General Prayuth's real intentions are. He has good reason to worry about resistance. "
+ "The pro-government Red-Shirt movement is far better organised than eight years ago, and could still be financed by former Prime Minister Thaksin Shinawatra's deep pockets.");
}
}