| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.tools.lang.english; |
| |
| import java.io.BufferedReader; |
| import java.io.FileReader; |
| import java.io.IOException; |
| import java.io.InputStreamReader; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| |
| import opennlp.tools.coref.DefaultLinker; |
| import opennlp.tools.coref.DiscourseEntity; |
| import opennlp.tools.coref.Linker; |
| import opennlp.tools.coref.LinkerMode; |
| import opennlp.tools.coref.mention.DefaultParse; |
| import opennlp.tools.coref.mention.Mention; |
| import opennlp.tools.coref.mention.MentionContext; |
| import opennlp.tools.coref.mention.PTBMentionFinder; |
| import opennlp.tools.parser.Parse; |
| import opennlp.tools.parser.chunking.Parser; |
| import opennlp.tools.util.Span; |
| |
| /** |
| * This class perform coreference for treebank style parses. |
| * It will only perform coreference over constituents defined in the trees and |
| * will not generate new constituents for pre-nominal entities or sub-entities in |
| * simple coordinated noun phrases. This linker requires that named-entity information also be provided. |
| * This information can be added to the parse using the -parse option with EnglishNameFinder. |
| * |
| * @deprecated will be removed soon! |
| */ |
| @Deprecated |
| public class TreebankLinker extends DefaultLinker { |
| |
| public TreebankLinker(String project, LinkerMode mode) throws IOException { |
| super(project,mode); |
| } |
| |
| public TreebankLinker(String project, LinkerMode mode, boolean useDiscourseModel) throws IOException { |
| super(project,mode,useDiscourseModel); |
| } |
| |
| public TreebankLinker(String project, LinkerMode mode, boolean useDiscourseModel, double fixedNonReferentialProbability) throws IOException { |
| super(project,mode,useDiscourseModel,fixedNonReferentialProbability); |
| } |
| |
| @Override |
| protected void initMentionFinder() { |
| mentionFinder = PTBMentionFinder.getInstance(headFinder); |
| } |
| |
| private static void showEntities(DiscourseEntity[] entities) { |
| for (int ei=0,en=entities.length;ei<en;ei++) { |
| System.out.println(ei+" "+entities[ei]); |
| } |
| } |
| |
| /** |
| * Identitifies corefernce relationships for parsed input passed via standard in. |
| * @param args The model directory. |
| * @throws IOException when the model directory can not be read. |
| */ |
| public static void main(String[] args) throws IOException { |
| if (args.length == 0) { |
| System.err.println("Usage: TreebankLinker model_directory < parses"); |
| System.exit(1); |
| } |
| BufferedReader in; |
| int ai =0; |
| String dataDir = args[ai++]; |
| if (ai == args.length) { |
| in = new BufferedReader(new InputStreamReader(System.in)); |
| } |
| else { |
| in = new BufferedReader(new FileReader(args[ai])); |
| } |
| Linker treebankLinker = new TreebankLinker(dataDir,LinkerMode.TEST); |
| int sentenceNumber = 0; |
| List<Mention> document = new ArrayList<Mention>(); |
| List<Parse> parses = new ArrayList<Parse>(); |
| for (String line=in.readLine();null != line;line = in.readLine()) { |
| if (line.equals("")) { |
| DiscourseEntity[] entities = treebankLinker.getEntities(document.toArray(new Mention[document.size()])); |
| //showEntities(entities); |
| new CorefParse(parses,entities).show(); |
| sentenceNumber=0; |
| document.clear(); |
| parses.clear(); |
| } |
| else { |
| Parse p = Parse.parseParse(line); |
| parses.add(p); |
| Mention[] extents = treebankLinker.getMentionFinder().getMentions(new DefaultParse(p,sentenceNumber)); |
| //construct new parses for mentions which don't have constituents. |
| for (int ei=0,en=extents.length;ei<en;ei++) { |
| //System.err.println("PennTreebankLiner.main: "+ei+" "+extents[ei]); |
| |
| if (extents[ei].getParse() == null) { |
| //not sure how to get head index, but its not used at this point. |
| Parse snp = new Parse(p.getText(),extents[ei].getSpan(),"NML",1.0,0); |
| p.insert(snp); |
| extents[ei].setParse(new DefaultParse(snp,sentenceNumber)); |
| } |
| |
| } |
| document.addAll(Arrays.asList(extents)); |
| sentenceNumber++; |
| } |
| } |
| if (document.size() > 0) { |
| DiscourseEntity[] entities = treebankLinker.getEntities(document.toArray(new Mention[document.size()])); |
| //showEntities(entities); |
| (new CorefParse(parses,entities)).show(); |
| } |
| } |
| } |
| |
| class CorefParse { |
| |
| private Map<Parse, Integer> parseMap; |
| private List<Parse> parses; |
| |
| public CorefParse(List<Parse> parses, DiscourseEntity[] entities) { |
| this.parses = parses; |
| parseMap = new HashMap<Parse, Integer>(); |
| for (int ei=0,en=entities.length;ei<en;ei++) { |
| if (entities[ei].getNumMentions() > 1) { |
| for (Iterator<MentionContext> mi = entities[ei].getMentions(); mi.hasNext();) { |
| MentionContext mc = mi.next(); |
| Parse mentionParse = ((DefaultParse) mc.getParse()).getParse(); |
| parseMap.put(mentionParse,ei+1); |
| //System.err.println("CorefParse: "+mc.getParse().hashCode()+" -> "+ (ei+1)); |
| } |
| } |
| } |
| } |
| |
| public void show() { |
| for (int pi=0,pn=parses.size();pi<pn;pi++) { |
| Parse p = parses.get(pi); |
| show(p); |
| System.out.println(); |
| } |
| } |
| |
| private void show(Parse p) { |
| int start; |
| start = p.getSpan().getStart(); |
| if (!p.getType().equals(Parser.TOK_NODE)) { |
| System.out.print("("); |
| System.out.print(p.getType()); |
| if (parseMap.containsKey(p)) { |
| System.out.print("#"+parseMap.get(p)); |
| } |
| //System.out.print(p.hashCode()+"-"+parseMap.containsKey(p)); |
| System.out.print(" "); |
| } |
| Parse[] children = p.getChildren(); |
| for (int pi=0,pn=children.length;pi<pn;pi++) { |
| Parse c = children[pi]; |
| Span s = c.getSpan(); |
| if (start < s.getStart()) { |
| System.out.print(p.getText().substring(start, s.getStart())); |
| } |
| show(c); |
| start = s.getEnd(); |
| } |
| System.out.print(p.getText().substring(start, p.getSpan().getEnd())); |
| if (!p.getType().equals(Parser.TOK_NODE)) { |
| System.out.print(")"); |
| } |
| } |
| } |