OPENNLP-575 Copied coref over to sandbox.
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/FullParseCorefEnhancerStream.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/FullParseCorefEnhancerStream.java
new file mode 100644
index 0000000..0666843
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/FullParseCorefEnhancerStream.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.parser.AbstractBottomUpParser;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.parser.Parser;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+public class FullParseCorefEnhancerStream extends FilterObjectStream<RawCorefSample, RawCorefSample> {
+
+ private final Parser parser;
+
+ public FullParseCorefEnhancerStream(Parser parser, ObjectStream<RawCorefSample> samples) {
+ super(samples);
+ this.parser = parser;
+ }
+
+ static Parse createIncompleteParse(String tokens[]) {
+
+ // produce text
+ Span tokenSpans[] = new Span[tokens.length];
+ StringBuilder textBuilder = new StringBuilder();
+
+ for (int i = 0; i < tokens.length; i++) {
+
+ if (textBuilder.length() > 0) {
+ textBuilder.append(' ');
+ }
+
+ int startOffset = textBuilder.length();
+ textBuilder.append(tokens[i]);
+ tokenSpans[i] = new Span(startOffset, textBuilder.length());
+ }
+
+ String text = textBuilder.toString();
+
+ Parse p = new Parse(text, new Span(0, text.length()), AbstractBottomUpParser.INC_NODE, 0, 0);
+
+ for (int i = 0; i < tokenSpans.length; i++) {
+ Span tokenSpan = tokenSpans[i];
+ p.insert(new Parse(text, new Span(tokenSpan.getStart(), tokenSpan.getEnd()), AbstractBottomUpParser.TOK_NODE, 0, i));
+ }
+
+ return p;
+ }
+
+ public RawCorefSample read() throws IOException {
+
+ RawCorefSample sample = samples.read();
+
+ if (sample != null) {
+
+ List<Parse> enhancedParses = new ArrayList<Parse>();
+
+ List<String[]> sentences = sample.getTexts();
+
+ for (int i = 0; i < sentences.size(); i++) {
+
+ String sentence[] = sentences.get(i);
+
+ Parse incompleteParse = createIncompleteParse(sentence);
+ Parse p = parser.parse(incompleteParse);
+
+        // TODO: handle the case where the parser cannot find a full parse for the sentence
+
+ enhancedParses.add(p);
+ }
+
+ sample.setParses(enhancedParses);
+
+ return sample;
+ }
+ else {
+ return null;
+ }
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java
new file mode 100644
index 0000000..137c0f3
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.namefind.TokenNameFinderModelLoader;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.cmdline.parser.ParserModelLoader;
+import opennlp.tools.cmdline.tokenizer.TokenizerModelLoader;
+import opennlp.tools.coref.CorefSample;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.formats.DirectorySampleStream;
+import opennlp.tools.formats.convert.FileToStringSampleStream;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.parser.Parser;
+import opennlp.tools.parser.ParserFactory;
+import opennlp.tools.parser.ParserModel;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Factory which creates a stream that parses MUC 6 coreference data and outputs CorefSample
+ * objects enhanced with a full parse, suitable for training the coreference component.
+ */
+public class Muc6FullParseCorefSampleStreamFactory extends AbstractSampleStreamFactory<CorefSample> {
+
+ interface Parameters extends BasicFormatParams {
+
+ @ParameterDescription(valueName = "modelFile")
+ File getParserModel();
+
+ @ParameterDescription(valueName = "modelFile")
+ File getTokenizerModel();
+
+ @ParameterDescription(valueName = "modelFile")
+ File getPersonModel();
+
+ @ParameterDescription(valueName = "modelFile")
+ File getOrganizationModel();
+
+ // TODO: Add other models here !!!
+ }
+
+ protected Muc6FullParseCorefSampleStreamFactory() {
+ super(Parameters.class);
+ }
+
+ public ObjectStream<CorefSample> create(String[] args) {
+
+ Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+ ParserModel parserModel = new ParserModelLoader().load(params.getParserModel());
+ Parser parser = ParserFactory.create(parserModel);
+
+ TokenizerModel tokenizerModel = new TokenizerModelLoader().load(params.getTokenizerModel());
+ Tokenizer tokenizer = new TokenizerME(tokenizerModel);
+
+ ObjectStream<String> mucDocStream = new FileToStringSampleStream(
+ new DirectorySampleStream(params.getData(), new FileFilter() {
+
+ public boolean accept(File file) {
+ return file.getName().toLowerCase().endsWith(".sgm");
+ }
+ }, false), Charset.forName("UTF-8"));
+
+ ObjectStream<RawCorefSample> rawSamples =
+ new MucCorefSampleStream(tokenizer, mucDocStream);
+
+ ObjectStream<RawCorefSample> parsedSamples = new FullParseCorefEnhancerStream(parser, rawSamples);
+
+
+    // TODO: find a nicer way to load all the name finder models;
+    // for now a separate parameter per model is used.
+
+ Map<String, File> modelFileTagMap = new HashMap<String, File>();
+
+ modelFileTagMap.put("person", params.getPersonModel());
+ modelFileTagMap.put("organization", params.getOrganizationModel());
+
+ List<TokenNameFinder> nameFinders = new ArrayList<TokenNameFinder>();
+ List<String> tags = new ArrayList<String>();
+
+ for (Map.Entry<String, File> entry : modelFileTagMap.entrySet()) {
+ nameFinders.add(new NameFinderME(
+ new TokenNameFinderModelLoader().load(entry.getValue())));
+ tags.add(entry.getKey());
+ }
+
+ return new MucMentionInserterStream(new NameFinderCorefEnhancerStream(nameFinders.toArray(
+ new TokenNameFinder[nameFinders.size()]),
+ tags.toArray(new String[tags.size()]), parsedSamples));
+ }
+
+ public static void registerFactory() {
+ StreamFactoryRegistry.registerFactory(CorefSample.class, "muc6full",
+ new Muc6FullParseCorefSampleStreamFactory());
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucCorefContentHandler.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucCorefContentHandler.java
new file mode 100644
index 0000000..d095b48
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucCorefContentHandler.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.Span;
+
+// Note:
+// Take care for special @ sign handling (identifies a table or something else that should be ignored)
+class MucCorefContentHandler extends SgmlParser.ContentHandler {
+
+ static class CorefMention {
+ Span span;
+ int id;
+ String min;
+
+ CorefMention(Span span, int id, String min) {
+ this.span = span;
+ this.id = id;
+ this.min = min;
+ }
+ }
+
+ static final String COREF_ELEMENT = "COREF";
+
+ private final Tokenizer tokenizer;
+ private final List<RawCorefSample> samples;
+
+ boolean isInsideContentElement = false;
+ private final List<String> text = new ArrayList<String>();
+ private Stack<CorefMention> mentionStack = new Stack<CorefMention>();
+ private List<CorefMention> mentions = new ArrayList<MucCorefContentHandler.CorefMention>();
+
+ private Map<Integer, Integer> idMap = new HashMap<Integer, Integer>();
+
+ private RawCorefSample sample;
+
+ MucCorefContentHandler(Tokenizer tokenizer, List<RawCorefSample> samples) {
+ this.tokenizer = tokenizer;
+ this.samples = samples;
+ }
+
+ /**
+ * Resolve an id via the references to the root id.
+ *
+ * @param id the id or reference to be resolved
+ *
+ * @return the resolved id or -1 if id cannot be resolved
+ */
+ private int resolveId(int id) {
+
+ Integer refId = idMap.get(id);
+
+ if (refId != null) {
+ if (id == refId) {
+ return id;
+ }
+ else {
+ return resolveId(refId);
+ }
+ }
+ else {
+ return -1;
+ }
+ }
+
+ @Override
+ public void startElement(String name, Map<String, String> attributes) {
+
+ if (MucElementNames.DOC_ELEMENT.equals(name)) {
+ idMap.clear();
+ sample = new RawCorefSample(new ArrayList<String>(),
+ new ArrayList<MucCorefContentHandler.CorefMention[]>());
+ }
+
+ if (MucElementNames.CONTENT_ELEMENTS.contains(name)) {
+ isInsideContentElement = true;
+ }
+
+ if (COREF_ELEMENT.equals(name)) {
+ int beginOffset = text.size();
+
+ String idString = attributes.get("ID");
+ String refString = attributes.get("REF");
+
+ int id;
+ if (idString != null) {
+        id = Integer.parseInt(idString); // TODO: may throw NumberFormatException on malformed data — report as an invalid-format error instead
+
+ if (refString == null) {
+ idMap.put(id, id);
+ }
+ else {
+ int ref = Integer.parseInt(refString);
+ idMap.put(id, ref);
+ }
+ }
+ else {
+ id = -1;
+ // throw invalid format exception ...
+ }
+
+ mentionStack.push(new CorefMention(new Span(beginOffset, beginOffset), id, attributes.get("MIN")));
+ }
+ }
+
+ @Override
+ public void characters(CharSequence chars) {
+ if (isInsideContentElement) {
+
+ String tokens [] = tokenizer.tokenize(chars.toString());
+
+ text.addAll(Arrays.asList(tokens));
+ }
+ }
+
+ @Override
+ public void endElement(String name) {
+
+ if (COREF_ELEMENT.equals(name)) {
+ CorefMention mention = mentionStack.pop();
+ mention.span = new Span(mention.span.getStart(), text.size());
+ mentions.add(mention);
+ }
+
+ if (MucElementNames.CONTENT_ELEMENTS.contains(name)) {
+
+ sample.getTexts().add(text.toArray(new String[text.size()]));
+ sample.getMentions().add(mentions.toArray(new CorefMention[mentions.size()]));
+
+ mentions.clear();
+ text.clear();
+ isInsideContentElement = false;
+ }
+
+ if (MucElementNames.DOC_ELEMENT.equals(name)) {
+
+ for (CorefMention mentions[] : sample.getMentions()) {
+ for (int i = 0; i < mentions.length; i++) {
+ mentions[i].id = resolveId(mentions[i].id);
+ }
+ }
+
+ samples.add(sample);
+ }
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucCorefSampleStream.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucCorefSampleStream.java
new file mode 100644
index 0000000..f139237
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucCorefSampleStream.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+public class MucCorefSampleStream extends FilterObjectStream<String, RawCorefSample> {
+
+ private final Tokenizer tokenizer;
+
+ private List<RawCorefSample> documents = new ArrayList<RawCorefSample>();
+
+ public MucCorefSampleStream(Tokenizer tokenizer, ObjectStream<String> documents) {
+ super(new DocumentSplitterStream(documents));
+ this.tokenizer = tokenizer;
+ }
+
+ public RawCorefSample read() throws IOException {
+
+ if (documents.isEmpty()) {
+
+ String document = samples.read();
+
+ if (document != null) {
+ new SgmlParser().parse(new StringReader(document),
+ new MucCorefContentHandler(tokenizer, documents));
+ }
+ }
+
+ if (documents.size() > 0) {
+ return documents.remove(0);
+ }
+ else {
+ return null;
+ }
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucMentionInserterStream.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucMentionInserterStream.java
new file mode 100644
index 0000000..95b9905
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucMentionInserterStream.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.coref.CorefSample;
+import opennlp.tools.coref.mention.DefaultParse;
+import opennlp.tools.coref.mention.Mention;
+import opennlp.tools.coref.mention.MentionFinder;
+import opennlp.tools.coref.mention.PTBHeadFinder;
+import opennlp.tools.coref.mention.PTBMentionFinder;
+import opennlp.tools.formats.muc.MucCorefContentHandler.CorefMention;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * The mention inserter is responsible for inserting the mentions from the training data
+ * into the parse trees.
+ */
+public class MucMentionInserterStream extends FilterObjectStream<RawCorefSample, CorefSample> {
+
+ private static Set<String> entitySet = new HashSet<String>(Arrays.asList(DefaultParse.NAME_TYPES));
+
+ private final MentionFinder mentionFinder;
+
+ protected MucMentionInserterStream(ObjectStream<RawCorefSample> samples) {
+ super(samples);
+
+ mentionFinder = PTBMentionFinder.getInstance(PTBHeadFinder.getInstance());
+ }
+
+ private static Span getMinSpan(Parse p, CorefMention mention) {
+ String min = mention.min;
+
+ if (min != null) {
+
+ int startOffset = p.toString().indexOf(min);
+ int endOffset = startOffset + min.length();
+
+ Parse tokens[] = p.getTagNodes();
+
+ int beginToken = -1;
+ int endToken = -1;
+
+ for (int i = 0; i < tokens.length; i++) {
+ if (tokens[i].getSpan().getStart() == startOffset) {
+ beginToken = i;
+ }
+
+ if (tokens[i].getSpan().getEnd() == endOffset) {
+ endToken = i + 1;
+ break;
+ }
+ }
+
+ if (beginToken != -1 && endToken != -1) {
+ return new Span(beginToken, endToken);
+ }
+ }
+
+ return null;
+ }
+
+ public static boolean addMention(int id, Span mention, Parse[] tokens) {
+
+ boolean failed = false;
+
+ Parse startToken = tokens[mention.getStart()];
+ Parse endToken = tokens[mention.getEnd() - 1];
+ Parse commonParent = startToken.getCommonParent(endToken);
+
+ if (commonParent != null) {
+// Span mentionSpan = new Span(startToken.getSpan().getStart(), endToken.getSpan().getEnd());
+
+ if (entitySet.contains(commonParent.getType())) {
+        commonParent.getParent().setType("NP#" + id); // NOTE(review): getParent() can be null when commonParent is the root — guard against NPE?
+ }
+ else if (commonParent.getType().equals("NML")) {
+ commonParent.setType("NML#" + id);
+ }
+ else if (commonParent.getType().equals("NP")) {
+ commonParent.setType("NP#" + id);
+ }
+ else {
+        System.err.println("Inserting mention failed: " + commonParent.getType() + " Failed id: " + id);
+ failed = true;
+ }
+ }
+ else {
+ throw new IllegalArgumentException("Tokens must always have a common parent!");
+ }
+
+ return !failed;
+ }
+
+ public CorefSample read() throws IOException {
+
+ RawCorefSample sample = samples.read();
+
+ if (sample != null) {
+
+ List<Parse> mentionParses = new ArrayList<Parse>();
+
+ List<CorefMention[]> allMentions = sample.getMentions();
+ List<Parse> allParses = sample.getParses();
+
+ for (int si = 0; si < allMentions.size(); si++) {
+ CorefMention mentions[] = allMentions.get(si);
+ Parse p = allParses.get(si);
+
+ for (Mention extent : mentionFinder.getMentions(new DefaultParse(p, si))) {
+ if (extent.getParse() == null) {
+ // not sure how to get head index
+ Parse snp = new Parse(p.getText(),extent.getSpan(),"NML",1.0,0);
+ p.insert(snp);
+ }
+ }
+
+ Parse tokens[] = p.getTagNodes();
+
+ for (CorefMention mention : mentions) {
+ Span min = getMinSpan(p, mention);
+
+ if (min == null) {
+ min = mention.span;
+ }
+
+ addMention(mention.id, min, tokens);
+ }
+
+      // p.show(); // NOTE(review): leftover debug output to stdout; disabled — remove once verified
+
+ mentionParses.add(p);
+ }
+
+ return new CorefSample(mentionParses);
+ }
+ else {
+ return null;
+ }
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/NameFinderCorefEnhancerStream.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/NameFinderCorefEnhancerStream.java
new file mode 100644
index 0000000..db350b9
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/NameFinderCorefEnhancerStream.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Adds names to the Coref Sample Stream.
+ */
+public class NameFinderCorefEnhancerStream extends FilterObjectStream<RawCorefSample, RawCorefSample> {
+
+ private TokenNameFinder nameFinders[];
+ private String tags[];
+
+ // TODO: Should be updated to use tag from span instead!
+ protected NameFinderCorefEnhancerStream(TokenNameFinder nameFinders[], String tags[], ObjectStream<RawCorefSample> samples) {
+ super(samples);
+ this.nameFinders = nameFinders;
+ this.tags = tags;
+ }
+
+ public RawCorefSample read() throws IOException {
+
+ RawCorefSample sample = samples.read();
+
+ if (sample != null) {
+
+ for (TokenNameFinder namefinder : nameFinders) {
+ namefinder.clearAdaptiveData();
+ }
+
+ List<Parse> parses = new ArrayList<Parse>();
+
+ for (Parse p : sample.getParses()) {
+
+ Parse parseTokens[] = p.getTagNodes();
+ String tokens[] = new String[parseTokens.length];
+
+ for (int i = 0; i < tokens.length; i++) {
+ tokens[i] = parseTokens[i].toString();
+ }
+
+ for (int i = 0; i < nameFinders.length; i++) {
+ Span names[] = nameFinders[i].find(tokens);
+ Parse.addNames(tags[i], names, parseTokens);
+ }
+
+ parses.add(p);
+ }
+
+ sample.setParses(parses);
+
+ return sample;
+ }
+ else {
+ return null;
+ }
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/RawCorefSample.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/RawCorefSample.java
new file mode 100644
index 0000000..d2ae672
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/RawCorefSample.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.formats.muc.MucCorefContentHandler.CorefMention;
+import opennlp.tools.parser.Parse;
+
+/**
+ * A coreference sample as it is extracted from MUC style training data.
+ */
+public class RawCorefSample {
+
+ private List<String[]> texts = new ArrayList<String[]>();
+ private List<CorefMention[]> mentions = new ArrayList<CorefMention[]>();
+
+ private List<Parse> parses;
+
+  RawCorefSample(List<String> texts, List<CorefMention[]> mentions) { // NOTE(review): both parameters are ignored; the fields above are initialized to empty lists
+ }
+
+ public List<String[]> getTexts() {
+ return texts;
+ }
+
+ public List<CorefMention[]> getMentions() {
+ return mentions;
+ }
+
+ void setParses(List<Parse> parses) {
+ this.parses = parses;
+ }
+
+ List<Parse> getParses() {
+ return parses;
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/ShallowParseCorefEnhancerStream.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/ShallowParseCorefEnhancerStream.java
new file mode 100644
index 0000000..05a06f5
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/ShallowParseCorefEnhancerStream.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.chunker.Chunker;
+import opennlp.tools.parser.AbstractBottomUpParser;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.postag.POSTagger;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+public class ShallowParseCorefEnhancerStream extends FilterObjectStream<RawCorefSample, RawCorefSample> {
+
+ private final POSTagger posTagger;
+ private final Chunker chunker;
+
+ public ShallowParseCorefEnhancerStream(POSTagger posTagger, Chunker chunker, ObjectStream<RawCorefSample> samples) {
+ super(samples);
+ this.posTagger = posTagger;
+ this.chunker = chunker;
+ }
+
+ public RawCorefSample read() throws IOException {
+
+ RawCorefSample sample = samples.read();
+
+ if (sample != null) {
+
+ List<Parse> enhancedParses = new ArrayList<Parse>();
+
+ List<String[]> sentences = sample.getTexts();
+
+ for (String sentence[] : sentences) {
+
+ Parse p = FullParseCorefEnhancerStream.createIncompleteParse(sentence);
+ p.setType(AbstractBottomUpParser.TOP_NODE);
+
+ Parse parseTokens[] = p.getChildren();
+
+        // add the POS tags to the incomplete parse
+ String tags[] = posTagger.tag(sentence);
+
+ for (int i = 0; i < parseTokens.length; i++) {
+ p.insert(new Parse(p.getText(), parseTokens[i].getSpan(), tags[i], 1d, parseTokens[i].getHeadIndex()));
+ }
+
+        // insert NP chunks into the parse
+ Span chunks[] = chunker.chunkAsSpans(sentence, tags);
+
+ for (Span chunk : chunks) {
+ if ("NP".equals(chunk.getType())) {
+          p.insert(new Parse(p.getText(), new Span(0,0), chunk.getType(), 1d, p.getHeadIndex())); // FIXME(review): Span(0,0) ignores the chunk's token extent — should this span the chunk's tokens?
+ }
+ }
+
+ enhancedParses.add(p);
+ }
+
+ sample.setParses(enhancedParses);
+
+ return sample;
+ }
+ else {
+ return null;
+ }
+ }
+}