OPENNLP-575 Copied coref over to sandbox.
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/FullParseCorefEnhancerStream.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/FullParseCorefEnhancerStream.java
new file mode 100644
index 0000000..0666843
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/FullParseCorefEnhancerStream.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.parser.AbstractBottomUpParser;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.parser.Parser;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+public class FullParseCorefEnhancerStream extends FilterObjectStream<RawCorefSample, RawCorefSample> {
+
+ private final Parser parser;
+
+ public FullParseCorefEnhancerStream(Parser parser, ObjectStream<RawCorefSample> samples) {
+ super(samples);
+ this.parser = parser;
+ }
+
+ static Parse createIncompleteParse(String tokens[]) {
+
+ // produce text
+ Span tokenSpans[] = new Span[tokens.length];
+ StringBuilder textBuilder = new StringBuilder();
+
+ for (int i = 0; i < tokens.length; i++) {
+
+ if (textBuilder.length() > 0) {
+ textBuilder.append(' ');
+ }
+
+ int startOffset = textBuilder.length();
+ textBuilder.append(tokens[i]);
+ tokenSpans[i] = new Span(startOffset, textBuilder.length());
+ }
+
+ String text = textBuilder.toString();
+
+ Parse p = new Parse(text, new Span(0, text.length()), AbstractBottomUpParser.INC_NODE, 0, 0);
+
+ for (int i = 0; i < tokenSpans.length; i++) {
+ Span tokenSpan = tokenSpans[i];
+ p.insert(new Parse(text, new Span(tokenSpan.getStart(), tokenSpan.getEnd()), AbstractBottomUpParser.TOK_NODE, 0, i));
+ }
+
+ return p;
+ }
+
+ public RawCorefSample read() throws IOException {
+
+ RawCorefSample sample = samples.read();
+
+ if (sample != null) {
+
+ List<Parse> enhancedParses = new ArrayList<Parse>();
+
+ List<String[]> sentences = sample.getTexts();
+
+ for (int i = 0; i < sentences.size(); i++) {
+
+ String sentence[] = sentences.get(i);
+
+ Parse incompleteParse = createIncompleteParse(sentence);
+ Parse p = parser.parse(incompleteParse);
+
+        // TODO: handle the case where the parser cannot find a full parse for the sentence
+
+ enhancedParses.add(p);
+ }
+
+ sample.setParses(enhancedParses);
+
+ return sample;
+ }
+ else {
+ return null;
+ }
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java
new file mode 100644
index 0000000..137c0f3
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.namefind.TokenNameFinderModelLoader;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.cmdline.parser.ParserModelLoader;
+import opennlp.tools.cmdline.tokenizer.TokenizerModelLoader;
+import opennlp.tools.coref.CorefSample;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.formats.DirectorySampleStream;
+import opennlp.tools.formats.convert.FileToStringSampleStream;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.parser.Parser;
+import opennlp.tools.parser.ParserFactory;
+import opennlp.tools.parser.ParserModel;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Factory which creates a stream that parses MUC 6 coreference data and outputs CorefSample
+ * objects enhanced with a full parse, suitable for training the coreference component.
+ */
+public class Muc6FullParseCorefSampleStreamFactory extends AbstractSampleStreamFactory<CorefSample> {
+
+ interface Parameters extends BasicFormatParams {
+
+ @ParameterDescription(valueName = "modelFile")
+ File getParserModel();
+
+ @ParameterDescription(valueName = "modelFile")
+ File getTokenizerModel();
+
+ @ParameterDescription(valueName = "modelFile")
+ File getPersonModel();
+
+ @ParameterDescription(valueName = "modelFile")
+ File getOrganizationModel();
+
+ // TODO: Add other models here !!!
+ }
+
+ protected Muc6FullParseCorefSampleStreamFactory() {
+ super(Parameters.class);
+ }
+
+ public ObjectStream<CorefSample> create(String[] args) {
+
+ Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+ ParserModel parserModel = new ParserModelLoader().load(params.getParserModel());
+ Parser parser = ParserFactory.create(parserModel);
+
+ TokenizerModel tokenizerModel = new TokenizerModelLoader().load(params.getTokenizerModel());
+ Tokenizer tokenizer = new TokenizerME(tokenizerModel);
+
+ ObjectStream<String> mucDocStream = new FileToStringSampleStream(
+ new DirectorySampleStream(params.getData(), new FileFilter() {
+
+ public boolean accept(File file) {
+ return file.getName().toLowerCase().endsWith(".sgm");
+ }
+ }, false), Charset.forName("UTF-8"));
+
+ ObjectStream<RawCorefSample> rawSamples =
+ new MucCorefSampleStream(tokenizer, mucDocStream);
+
+ ObjectStream<RawCorefSample> parsedSamples = new FullParseCorefEnhancerStream(parser, rawSamples);
+
+
+    // TODO: find a nicer way to load all the name finder models;
+    // for now a separate parameter per model is used.
+
+ Map<String, File> modelFileTagMap = new HashMap<String, File>();
+
+ modelFileTagMap.put("person", params.getPersonModel());
+ modelFileTagMap.put("organization", params.getOrganizationModel());
+
+ List<TokenNameFinder> nameFinders = new ArrayList<TokenNameFinder>();
+ List<String> tags = new ArrayList<String>();
+
+ for (Map.Entry<String, File> entry : modelFileTagMap.entrySet()) {
+ nameFinders.add(new NameFinderME(
+ new TokenNameFinderModelLoader().load(entry.getValue())));
+ tags.add(entry.getKey());
+ }
+
+ return new MucMentionInserterStream(new NameFinderCorefEnhancerStream(nameFinders.toArray(
+ new TokenNameFinder[nameFinders.size()]),
+ tags.toArray(new String[tags.size()]), parsedSamples));
+ }
+
+ public static void registerFactory() {
+ StreamFactoryRegistry.registerFactory(CorefSample.class, "muc6full",
+ new Muc6FullParseCorefSampleStreamFactory());
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucCorefContentHandler.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucCorefContentHandler.java
new file mode 100644
index 0000000..d095b48
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucCorefContentHandler.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.Span;
+
+// Note:
+// Take care for special @ sign handling (identifies a table or something else that should be ignored)
+class MucCorefContentHandler extends SgmlParser.ContentHandler {
+
+ static class CorefMention {
+ Span span;
+ int id;
+ String min;
+
+ CorefMention(Span span, int id, String min) {
+ this.span = span;
+ this.id = id;
+ this.min = min;
+ }
+ }
+
+ static final String COREF_ELEMENT = "COREF";
+
+ private final Tokenizer tokenizer;
+ private final List<RawCorefSample> samples;
+
+ boolean isInsideContentElement = false;
+ private final List<String> text = new ArrayList<String>();
+ private Stack<CorefMention> mentionStack = new Stack<CorefMention>();
+ private List<CorefMention> mentions = new ArrayList<MucCorefContentHandler.CorefMention>();
+
+ private Map<Integer, Integer> idMap = new HashMap<Integer, Integer>();
+
+ private RawCorefSample sample;
+
+ MucCorefContentHandler(Tokenizer tokenizer, List<RawCorefSample> samples) {
+ this.tokenizer = tokenizer;
+ this.samples = samples;
+ }
+
+ /**
+ * Resolve an id via the references to the root id.
+ *
+ * @param id the id or reference to be resolved
+ *
+ * @return the resolved id or -1 if id cannot be resolved
+ */
+ private int resolveId(int id) {
+
+ Integer refId = idMap.get(id);
+
+ if (refId != null) {
+ if (id == refId) {
+ return id;
+ }
+ else {
+ return resolveId(refId);
+ }
+ }
+ else {
+ return -1;
+ }
+ }
+
+ @Override
+ public void startElement(String name, Map<String, String> attributes) {
+
+ if (MucElementNames.DOC_ELEMENT.equals(name)) {
+ idMap.clear();
+ sample = new RawCorefSample(new ArrayList<String>(),
+ new ArrayList<MucCorefContentHandler.CorefMention[]>());
+ }
+
+ if (MucElementNames.CONTENT_ELEMENTS.contains(name)) {
+ isInsideContentElement = true;
+ }
+
+ if (COREF_ELEMENT.equals(name)) {
+ int beginOffset = text.size();
+
+ String idString = attributes.get("ID");
+ String refString = attributes.get("REF");
+
+ int id;
+ if (idString != null) {
+        id = Integer.parseInt(idString); // TODO: may throw NumberFormatException on malformed data — report as an invalid-format error instead
+
+ if (refString == null) {
+ idMap.put(id, id);
+ }
+ else {
+ int ref = Integer.parseInt(refString);
+ idMap.put(id, ref);
+ }
+ }
+ else {
+ id = -1;
+ // throw invalid format exception ...
+ }
+
+ mentionStack.push(new CorefMention(new Span(beginOffset, beginOffset), id, attributes.get("MIN")));
+ }
+ }
+
+ @Override
+ public void characters(CharSequence chars) {
+ if (isInsideContentElement) {
+
+ String tokens [] = tokenizer.tokenize(chars.toString());
+
+ text.addAll(Arrays.asList(tokens));
+ }
+ }
+
+ @Override
+ public void endElement(String name) {
+
+ if (COREF_ELEMENT.equals(name)) {
+ CorefMention mention = mentionStack.pop();
+ mention.span = new Span(mention.span.getStart(), text.size());
+ mentions.add(mention);
+ }
+
+ if (MucElementNames.CONTENT_ELEMENTS.contains(name)) {
+
+ sample.getTexts().add(text.toArray(new String[text.size()]));
+ sample.getMentions().add(mentions.toArray(new CorefMention[mentions.size()]));
+
+ mentions.clear();
+ text.clear();
+ isInsideContentElement = false;
+ }
+
+ if (MucElementNames.DOC_ELEMENT.equals(name)) {
+
+ for (CorefMention mentions[] : sample.getMentions()) {
+ for (int i = 0; i < mentions.length; i++) {
+ mentions[i].id = resolveId(mentions[i].id);
+ }
+ }
+
+ samples.add(sample);
+ }
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucCorefSampleStream.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucCorefSampleStream.java
new file mode 100644
index 0000000..f139237
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucCorefSampleStream.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+public class MucCorefSampleStream extends FilterObjectStream<String, RawCorefSample> {
+
+ private final Tokenizer tokenizer;
+
+ private List<RawCorefSample> documents = new ArrayList<RawCorefSample>();
+
+ public MucCorefSampleStream(Tokenizer tokenizer, ObjectStream<String> documents) {
+ super(new DocumentSplitterStream(documents));
+ this.tokenizer = tokenizer;
+ }
+
+ public RawCorefSample read() throws IOException {
+
+ if (documents.isEmpty()) {
+
+ String document = samples.read();
+
+ if (document != null) {
+ new SgmlParser().parse(new StringReader(document),
+ new MucCorefContentHandler(tokenizer, documents));
+ }
+ }
+
+ if (documents.size() > 0) {
+ return documents.remove(0);
+ }
+ else {
+ return null;
+ }
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucMentionInserterStream.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucMentionInserterStream.java
new file mode 100644
index 0000000..95b9905
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/MucMentionInserterStream.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.coref.CorefSample;
+import opennlp.tools.coref.mention.DefaultParse;
+import opennlp.tools.coref.mention.Mention;
+import opennlp.tools.coref.mention.MentionFinder;
+import opennlp.tools.coref.mention.PTBHeadFinder;
+import opennlp.tools.coref.mention.PTBMentionFinder;
+import opennlp.tools.formats.muc.MucCorefContentHandler.CorefMention;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * The mention inserter is responsible for inserting the mentions from the training data
+ * into the parse trees.
+ */
+public class MucMentionInserterStream extends FilterObjectStream<RawCorefSample, CorefSample> {
+
+ private static Set<String> entitySet = new HashSet<String>(Arrays.asList(DefaultParse.NAME_TYPES));
+
+ private final MentionFinder mentionFinder;
+
+ protected MucMentionInserterStream(ObjectStream<RawCorefSample> samples) {
+ super(samples);
+
+ mentionFinder = PTBMentionFinder.getInstance(PTBHeadFinder.getInstance());
+ }
+
+ private static Span getMinSpan(Parse p, CorefMention mention) {
+ String min = mention.min;
+
+ if (min != null) {
+
+ int startOffset = p.toString().indexOf(min);
+ int endOffset = startOffset + min.length();
+
+ Parse tokens[] = p.getTagNodes();
+
+ int beginToken = -1;
+ int endToken = -1;
+
+ for (int i = 0; i < tokens.length; i++) {
+ if (tokens[i].getSpan().getStart() == startOffset) {
+ beginToken = i;
+ }
+
+ if (tokens[i].getSpan().getEnd() == endOffset) {
+ endToken = i + 1;
+ break;
+ }
+ }
+
+ if (beginToken != -1 && endToken != -1) {
+ return new Span(beginToken, endToken);
+ }
+ }
+
+ return null;
+ }
+
+ public static boolean addMention(int id, Span mention, Parse[] tokens) {
+
+ boolean failed = false;
+
+ Parse startToken = tokens[mention.getStart()];
+ Parse endToken = tokens[mention.getEnd() - 1];
+ Parse commonParent = startToken.getCommonParent(endToken);
+
+ if (commonParent != null) {
+// Span mentionSpan = new Span(startToken.getSpan().getStart(), endToken.getSpan().getEnd());
+
+ if (entitySet.contains(commonParent.getType())) {
+        commonParent.getParent().setType("NP#" + id); // NOTE(review): getParent() can be null when commonParent is the root — guard against NPE?
+ }
+ else if (commonParent.getType().equals("NML")) {
+ commonParent.setType("NML#" + id);
+ }
+ else if (commonParent.getType().equals("NP")) {
+ commonParent.setType("NP#" + id);
+ }
+ else {
+        System.err.println("Inserting mention failed: " + commonParent.getType() + " Failed id: " + id);
+ failed = true;
+ }
+ }
+ else {
+ throw new IllegalArgumentException("Tokens must always have a common parent!");
+ }
+
+ return !failed;
+ }
+
+ public CorefSample read() throws IOException {
+
+ RawCorefSample sample = samples.read();
+
+ if (sample != null) {
+
+ List<Parse> mentionParses = new ArrayList<Parse>();
+
+ List<CorefMention[]> allMentions = sample.getMentions();
+ List<Parse> allParses = sample.getParses();
+
+ for (int si = 0; si < allMentions.size(); si++) {
+ CorefMention mentions[] = allMentions.get(si);
+ Parse p = allParses.get(si);
+
+ for (Mention extent : mentionFinder.getMentions(new DefaultParse(p, si))) {
+ if (extent.getParse() == null) {
+ // not sure how to get head index
+ Parse snp = new Parse(p.getText(),extent.getSpan(),"NML",1.0,0);
+ p.insert(snp);
+ }
+ }
+
+ Parse tokens[] = p.getTagNodes();
+
+ for (CorefMention mention : mentions) {
+ Span min = getMinSpan(p, mention);
+
+ if (min == null) {
+ min = mention.span;
+ }
+
+ addMention(mention.id, min, tokens);
+ }
+
+      // p.show(); // NOTE(review): leftover debug output to stdout; disabled — remove once verified
+
+ mentionParses.add(p);
+ }
+
+ return new CorefSample(mentionParses);
+ }
+ else {
+ return null;
+ }
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/NameFinderCorefEnhancerStream.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/NameFinderCorefEnhancerStream.java
new file mode 100644
index 0000000..db350b9
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/NameFinderCorefEnhancerStream.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Adds names to the Coref Sample Stream.
+ */
+public class NameFinderCorefEnhancerStream extends FilterObjectStream<RawCorefSample, RawCorefSample> {
+
+ private TokenNameFinder nameFinders[];
+ private String tags[];
+
+ // TODO: Should be updated to use tag from span instead!
+ protected NameFinderCorefEnhancerStream(TokenNameFinder nameFinders[], String tags[], ObjectStream<RawCorefSample> samples) {
+ super(samples);
+ this.nameFinders = nameFinders;
+ this.tags = tags;
+ }
+
+ public RawCorefSample read() throws IOException {
+
+ RawCorefSample sample = samples.read();
+
+ if (sample != null) {
+
+ for (TokenNameFinder namefinder : nameFinders) {
+ namefinder.clearAdaptiveData();
+ }
+
+ List<Parse> parses = new ArrayList<Parse>();
+
+ for (Parse p : sample.getParses()) {
+
+ Parse parseTokens[] = p.getTagNodes();
+ String tokens[] = new String[parseTokens.length];
+
+ for (int i = 0; i < tokens.length; i++) {
+ tokens[i] = parseTokens[i].toString();
+ }
+
+ for (int i = 0; i < nameFinders.length; i++) {
+ Span names[] = nameFinders[i].find(tokens);
+ Parse.addNames(tags[i], names, parseTokens);
+ }
+
+ parses.add(p);
+ }
+
+ sample.setParses(parses);
+
+ return sample;
+ }
+ else {
+ return null;
+ }
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/RawCorefSample.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/RawCorefSample.java
new file mode 100644
index 0000000..d2ae672
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/RawCorefSample.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.formats.muc.MucCorefContentHandler.CorefMention;
+import opennlp.tools.parser.Parse;
+
+/**
+ * A coreference sample as it is extracted from MUC style training data.
+ */
+public class RawCorefSample {
+
+ private List<String[]> texts = new ArrayList<String[]>();
+ private List<CorefMention[]> mentions = new ArrayList<CorefMention[]>();
+
+ private List<Parse> parses;
+
+  RawCorefSample(List<String> texts, List<CorefMention[]> mentions) { // NOTE(review): both parameters are ignored; the fields above are initialized to empty lists
+ }
+
+ public List<String[]> getTexts() {
+ return texts;
+ }
+
+ public List<CorefMention[]> getMentions() {
+ return mentions;
+ }
+
+ void setParses(List<Parse> parses) {
+ this.parses = parses;
+ }
+
+ List<Parse> getParses() {
+ return parses;
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/ShallowParseCorefEnhancerStream.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/ShallowParseCorefEnhancerStream.java
new file mode 100644
index 0000000..05a06f5
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/ShallowParseCorefEnhancerStream.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.chunker.Chunker;
+import opennlp.tools.parser.AbstractBottomUpParser;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.postag.POSTagger;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+public class ShallowParseCorefEnhancerStream extends FilterObjectStream<RawCorefSample, RawCorefSample> {
+
+ private final POSTagger posTagger;
+ private final Chunker chunker;
+
+ public ShallowParseCorefEnhancerStream(POSTagger posTagger, Chunker chunker, ObjectStream<RawCorefSample> samples) {
+ super(samples);
+ this.posTagger = posTagger;
+ this.chunker = chunker;
+ }
+
+ public RawCorefSample read() throws IOException {
+
+ RawCorefSample sample = samples.read();
+
+ if (sample != null) {
+
+ List<Parse> enhancedParses = new ArrayList<Parse>();
+
+ List<String[]> sentences = sample.getTexts();
+
+ for (String sentence[] : sentences) {
+
+ Parse p = FullParseCorefEnhancerStream.createIncompleteParse(sentence);
+ p.setType(AbstractBottomUpParser.TOP_NODE);
+
+ Parse parseTokens[] = p.getChildren();
+
+        // add the POS tags to the incomplete parse
+ String tags[] = posTagger.tag(sentence);
+
+ for (int i = 0; i < parseTokens.length; i++) {
+ p.insert(new Parse(p.getText(), parseTokens[i].getSpan(), tags[i], 1d, parseTokens[i].getHeadIndex()));
+ }
+
+        // insert NP chunks into the parse
+ Span chunks[] = chunker.chunkAsSpans(sentence, tags);
+
+ for (Span chunk : chunks) {
+ if ("NP".equals(chunk.getType())) {
+          p.insert(new Parse(p.getText(), new Span(0,0), chunk.getType(), 1d, p.getHeadIndex())); // FIXME(review): Span(0,0) ignores the chunk's token extent — should this span the chunk's tokens?
+ }
+ }
+
+ enhancedParses.add(p);
+ }
+
+ sample.setParses(enhancedParses);
+
+ return sample;
+ }
+ else {
+ return null;
+ }
+ }
+}