OPENNLP-575 Copied coref component main code over to sandbox project.
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/AbstractLinker.java b/opennlp-coref/src/main/java/opennlp/tools/coref/AbstractLinker.java
new file mode 100644
index 0000000..184718c
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/AbstractLinker.java
@@ -0,0 +1,276 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref;
+
+import java.io.IOException;
+
+import opennlp.tools.coref.mention.HeadFinder;
+import opennlp.tools.coref.mention.Mention;
+import opennlp.tools.coref.mention.MentionContext;
+import opennlp.tools.coref.mention.MentionFinder;
+import opennlp.tools.coref.mention.Parse;
+import opennlp.tools.coref.resolver.AbstractResolver;
+import opennlp.tools.coref.sim.Gender;
+import opennlp.tools.coref.sim.Number;
+
+/**
+ * Provides a default implementation of many of the methods in {@link Linker} that
+ * most implementations of {@link Linker} will want to extend.
+ */
+public abstract class AbstractLinker implements Linker {
+
+ /** The mention finder used to find mentions. */
+ protected MentionFinder mentionFinder;
+
+ /** Specifies whether debug print is generated. */
+ protected boolean debug = true;
+
+ /** The mode in which this linker is running. */
+ protected LinkerMode mode;
+
+ /** Instance used for returning the same linker for subsequent getInstance requests. */
+ protected static Linker linker;
+
+ /** The resolvers used by this Linker. */
+ protected AbstractResolver[] resolvers;
+ /** The names of the resolvers used by this Linker. */
+ protected String[] resolverNames;
+
+ /** Array used to store the results of each call made to the linker. */
+ protected DiscourseEntity[] entities;
+
+ /** The index of resolver which is used for singular pronouns. */
+ protected int SINGULAR_PRONOUN;
+
+ /** The name of the project where the coreference models are stored. */
+ protected String corefProject;
+
+ /** The head finder used in this linker. */
+ protected HeadFinder headFinder;
+
+ /** Specifies whether coreferent mentions should be combined into a single entity.
+ * Set this to true to combine them, false otherwise. */
+ protected boolean useDiscourseModel;
+
+ /** Specifies whether mentions for which no resolver can be used should be excluded from the
+ * discourse model.
+ */
+ protected boolean removeUnresolvedMentions;
+
+ /**
+ * Creates a new linker using the models in the specified project directory and using the specified mode.
+ * @param project The location of the models or other data needed by this linker.
+ * @param mode The mode the linker should be run in: testing, training, or evaluation.
+ */
+ public AbstractLinker(String project, LinkerMode mode) {
+ this(project,mode,true);
+ }
+
+ /**
+ * Creates a new linker using the models in the specified project directory, using the specified mode,
+ * and combining coreferent entities based on the specified value.
+ * @param project The location of the models or other data needed by this linker.
+ * @param mode The mode the linker should be run in: testing, training, or evaluation.
+ * @param useDiscourseModel Specifies whether coreferent mentions should be combined or not.
+ */
+ public AbstractLinker(String project, LinkerMode mode,boolean useDiscourseModel) {
+ this.corefProject = project;
+ this.mode = mode;
+ SINGULAR_PRONOUN = -1;
+ this.useDiscourseModel = useDiscourseModel;
+ removeUnresolvedMentions = true;
+ }
+
+ /**
+ * Resolves the specified mention to an entity in the specified discourse model or creates a new entity for the mention.
+ * @param mention The mention to resolve.
+ * @param discourseModel The discourse model of existing entities.
+ */
+ protected void resolve(MentionContext mention, DiscourseModel discourseModel) {
+ //System.err.println("AbstractLinker.resolve: "+mode+"("+econtext.id+") "+econtext.toText());
+ boolean validEntity = true; // true if we should add this entity to the dm
+ boolean canResolve = false;
+
+ for (int ri = 0; ri < resolvers.length; ri++) {
+ if (resolvers[ri].canResolve(mention)) {
+ if (mode == LinkerMode.TEST) {
+ entities[ri] = resolvers[ri].resolve(mention, discourseModel);
+ canResolve = true;
+ }
+ else if (mode == LinkerMode.TRAIN) {
+ entities[ri] = resolvers[ri].retain(mention, discourseModel);
+ if (ri+1 != resolvers.length) {
+ canResolve = true;
+ }
+ }
+ else if (mode == LinkerMode.EVAL) {
+ entities[ri] = resolvers[ri].retain(mention, discourseModel);
+ //DiscourseEntity rde = resolvers[ri].resolve(mention, discourseModel);
+ //eval.update(rde == entities[ri], ri, entities[ri], rde);
+ }
+ else {
+ System.err.println("AbstractLinker.Unknown mode: " + mode);
+ }
+ if (ri == SINGULAR_PRONOUN && entities[ri] == null) {
+ validEntity = false;
+ }
+ }
+ else {
+ entities[ri] = null;
+ }
+ }
+ if (!canResolve && removeUnresolvedMentions) {
+ //System.err.println("No resolver for: "+econtext.toText()+ " head="+econtext.headTokenText+" "+econtext.headTokenTag);
+ validEntity = false;
+ }
+ DiscourseEntity de = checkForMerges(discourseModel, entities);
+ if (validEntity) {
+ updateExtent(discourseModel, mention, de,useDiscourseModel);
+ }
+ }
+
+ public HeadFinder getHeadFinder() {
+ return headFinder;
+ }
+
+ /**
+ * Updates the specified discourse model with the specified mention as coreferent with the specified entity.
+ * @param dm The discourse model
+ * @param mention The mention to be added to the specified entity.
+ * @param entity The entity which is mentioned by the specified mention.
+ * @param useDiscourseModel Whether the mentions should be kept as an entity or simply co-indexed.
+ */
+ protected void updateExtent(DiscourseModel dm, MentionContext mention, DiscourseEntity entity, boolean useDiscourseModel) {
+ if (useDiscourseModel) {
+ if (entity != null) {
+ //System.err.println("AbstractLinker.updateExtent: addingExtent:
+ // "+econtext.toText());
+ if (entity.getGenderProbability() < mention.getGenderProb()) {
+ entity.setGender(mention.getGender());
+ entity.setGenderProbability(mention.getGenderProb());
+ }
+ if (entity.getNumberProbability() < mention.getNumberProb()) {
+ entity.setNumber(mention.getNumber());
+ entity.setNumberProbability(mention.getNumberProb());
+ }
+ entity.addMention(mention);
+ dm.mentionEntity(entity);
+ }
+ else {
+ //System.err.println("AbstractLinker.updateExtent: creatingExtent:
+ // "+econtext.toText()+" "+econtext.gender+" "+econtext.number);
+ entity = new DiscourseEntity(mention, mention.getGender(), mention.getGenderProb(), mention.getNumber(), mention.getNumberProb());
+ dm.addEntity(entity);
+ }
+ }
+ else {
+ if (entity != null) {
+ DiscourseEntity newEntity = new DiscourseEntity(mention, mention.getGender(), mention.getGenderProb(), mention.getNumber(), mention.getNumberProb());
+ dm.addEntity(newEntity);
+ newEntity.setId(entity.getId());
+ }
+ else {
+ DiscourseEntity newEntity = new DiscourseEntity(mention, mention.getGender(), mention.getGenderProb(), mention.getNumber(), mention.getNumberProb());
+ dm.addEntity(newEntity);
+ }
+ }
+ //System.err.println(de1);
+ }
+
+ protected DiscourseEntity checkForMerges(DiscourseModel dm, DiscourseEntity[] des) {
+ DiscourseEntity de1; //temporary variable
+ DiscourseEntity de2; //temporary variable
+ de1 = des[0];
+ for (int di = 1; di < des.length; di++) {
+ de2 = des[di];
+ if (de2 != null) {
+ if (de1 != null && de1 != de2) {
+ dm.mergeEntities(de1, de2, 1);
+ }
+ else {
+ de1 = de2;
+ }
+ }
+ }
+ return (de1);
+ }
+
+ public DiscourseEntity[] getEntities(Mention[] mentions) {
+ MentionContext[] extentContexts = this.constructMentionContexts(mentions);
+ DiscourseModel dm = new DiscourseModel();
+ for (int ei = 0; ei < extentContexts.length; ei++) {
+ //System.err.println(ei+" "+extentContexts[ei].toText());
+ resolve(extentContexts[ei], dm);
+ }
+ return (dm.getEntities());
+ }
+
+ public void setEntities(Mention[] mentions) {
+ getEntities(mentions);
+ }
+
+ public void train() throws IOException {
+ for (int ri = 0; ri < resolvers.length; ri++) {
+ resolvers[ri].train();
+ }
+ }
+
+ public MentionFinder getMentionFinder() {
+ return mentionFinder;
+ }
+
+ public MentionContext[] constructMentionContexts(Mention[] mentions) {
+ int mentionInSentenceIndex=-1;
+ int numMentionsInSentence=-1;
+ int prevSentenceIndex = -1;
+ MentionContext[] contexts = new MentionContext[mentions.length];
+ for (int mi=0,mn=mentions.length;mi<mn;mi++) {
+ Parse mentionParse = mentions[mi].getParse();
+ //System.err.println("AbstractLinker.constructMentionContexts: mentionParse="+mentionParse);
+ if (mentionParse == null) {
+ System.err.println("no parse for "+mentions[mi]);
+ }
+ int sentenceIndex = mentionParse.getSentenceNumber();
+ if (sentenceIndex != prevSentenceIndex) {
+ mentionInSentenceIndex=0;
+ prevSentenceIndex = sentenceIndex;
+ numMentionsInSentence = 0;
+ for (int msi=mi;msi<mentions.length;msi++) {
+ if (sentenceIndex != mentions[msi].getParse().getSentenceNumber()) {
+ break;
+ }
+ numMentionsInSentence++;
+ }
+ }
+ contexts[mi]=new MentionContext(mentions[mi], mentionInSentenceIndex, numMentionsInSentence, mi, sentenceIndex, getHeadFinder());
+ //System.err.println("AbstractLinker.constructMentionContexts: mi="+mi+" sn="+mentionParse.getSentenceNumber()+" extent="+mentions[mi]+" parse="+mentionParse.getSpan()+" mc="+contexts[mi].toText());
+ contexts[mi].setId(mentions[mi].getId());
+ mentionInSentenceIndex++;
+ if (mode != LinkerMode.SIM) {
+ Gender g = computeGender(contexts[mi]);
+ contexts[mi].setGender(g.getType(),g.getConfidence());
+ Number n = computeNumber(contexts[mi]);
+ contexts[mi].setNumber(n.getType(),n.getConfidence());
+ }
+ }
+ return (contexts);
+ }
+
+ protected abstract Gender computeGender(MentionContext mention);
+ protected abstract Number computeNumber(MentionContext mention);
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/CorefModel.java b/opennlp-coref/src/main/java/opennlp/tools/coref/CorefModel.java
new file mode 100644
index 0000000..afcf27a
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/CorefModel.java
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.coref;
+
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.zip.GZIPInputStream;
+
+import opennlp.maxent.io.BinaryGISModelReader;
+import opennlp.model.AbstractModel;
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.util.StringList;
+import opennlp.tools.util.model.BaseModel;
+
+public class CorefModel extends BaseModel {
+
+ private static final String COMPONENT_NAME = "Coref";
+
+ private static final String MALE_NAMES_DICTIONARY_ENTRY_NAME = "maleNames.dictionary";
+
+ private static final String FEMALE_NAMES_DICTIONARY_ENTRY_NAME = "femaleNames.dictionary";
+
+ private static final String NUMBER_MODEL_ENTRY_NAME = "number.model";
+
+// private Map<String, Set<String>> acronyms;
+
+ private static final String COMMON_NOUN_RESOLVER_MODEL_ENTRY_NAME =
+ "commonNounResolver.model";
+
+ private static final String DEFINITE_NOUN_RESOLVER_MODEL_ENTRY_NAME =
+ "definiteNounResolver.model";
+
+ private static final String SPEECH_PRONOUN_RESOLVER_MODEL_ENTRY_NAME =
+ "speechPronounResolver.model";
+
+ // TODO: Add IModel
+
+ private static final String PLURAL_NOUN_RESOLVER_MODEL_ENTRY_NAME =
+ "pluralNounResolver.model";
+
+ private static final String SINGULAR_PRONOUN_RESOLVER_MODEL_ENTRY_NAME =
+ "singularPronounResolver.model";
+
+ private static final String PROPER_NOUN_RESOLVER_MODEL_ENTRY_NAME =
+ "properNounResolver.model";
+
+ private static final String SIM_MODEL_ENTRY_NAME = "sim.model";
+
+ private static final String PLURAL_PRONOUN_RESOLVER_MODEL_ENTRY_NAME =
+ "pluralPronounResolver.model";
+
+ public CorefModel(String languageCode, String project) throws IOException {
+ super(COMPONENT_NAME, languageCode, null);
+
+ artifactMap.put(MALE_NAMES_DICTIONARY_ENTRY_NAME,
+ readNames(project + File.separator + "gen.mas"));
+
+ artifactMap.put(FEMALE_NAMES_DICTIONARY_ENTRY_NAME,
+ readNames(project + File.separator + "gen.fem"));
+
+ // TODO: Create acronyms
+
+ artifactMap.put(NUMBER_MODEL_ENTRY_NAME,
+ createModel(project + File.separator + "num.bin.gz"));
+
+ artifactMap.put(COMMON_NOUN_RESOLVER_MODEL_ENTRY_NAME,
+ createModel(project + File.separator + "cmodel.bin.gz"));
+
+ artifactMap.put(DEFINITE_NOUN_RESOLVER_MODEL_ENTRY_NAME,
+ createModel(project + File.separator + "defmodel.bin.gz"));
+
+
+ artifactMap.put(SPEECH_PRONOUN_RESOLVER_MODEL_ENTRY_NAME,
+ createModel(project + File.separator + "fmodel.bin.gz"));
+
+ // TODO: IModel
+
+ artifactMap.put(PLURAL_NOUN_RESOLVER_MODEL_ENTRY_NAME,
+ createModel(project + File.separator + "plmodel.bin.gz"));
+
+ artifactMap.put(SINGULAR_PRONOUN_RESOLVER_MODEL_ENTRY_NAME,
+ createModel(project + File.separator + "pmodel.bin.gz"));
+
+ artifactMap.put(PROPER_NOUN_RESOLVER_MODEL_ENTRY_NAME,
+ createModel(project + File.separator + "pnmodel.bin.gz"));
+
+ artifactMap.put(SIM_MODEL_ENTRY_NAME,
+ createModel(project + File.separator + "sim.bin.gz"));
+
+ artifactMap.put(PLURAL_PRONOUN_RESOLVER_MODEL_ENTRY_NAME,
+ createModel(project + File.separator + "tmodel.bin.gz"));
+
+ checkArtifactMap();
+ }
+
+ private AbstractModel createModel(String fileName) throws IOException {
+ return new BinaryGISModelReader(new DataInputStream(new GZIPInputStream(
+ new FileInputStream(fileName)))).getModel();
+ }
+
+ private static Dictionary readNames(String nameFile) throws IOException {
+ Dictionary names = new Dictionary();
+
+ BufferedReader nameReader = new BufferedReader(new FileReader(nameFile));
+ for (String line = nameReader.readLine(); line != null; line = nameReader.readLine()) {
+ names.put(new StringList(line));
+ }
+
+ return names;
+ }
+
+ public Dictionary getMaleNames() {
+ return (Dictionary) artifactMap.get(MALE_NAMES_DICTIONARY_ENTRY_NAME);
+ }
+
+ public Dictionary getFemaleNames() {
+ return (Dictionary) artifactMap.get(FEMALE_NAMES_DICTIONARY_ENTRY_NAME);
+ }
+
+ public AbstractModel getNumberModel() {
+ return (AbstractModel) artifactMap.get(NUMBER_MODEL_ENTRY_NAME);
+ }
+
+// public AcronymDictionary getAcronyms() {
+// return null;
+// }
+
+ public AbstractModel getCommonNounResolverModel() {
+ return (AbstractModel) artifactMap.get(COMMON_NOUN_RESOLVER_MODEL_ENTRY_NAME);
+ }
+
+ public AbstractModel getDefiniteNounResolverModel() {
+ return (AbstractModel) artifactMap.get(DEFINITE_NOUN_RESOLVER_MODEL_ENTRY_NAME);
+ }
+
+ public AbstractModel getSpeechPronounResolverModel() {
+ return (AbstractModel) artifactMap.get(SPEECH_PRONOUN_RESOLVER_MODEL_ENTRY_NAME);
+ }
+
+ // TODO: Where is this model used ?
+// public AbstractModel getIModel() {
+// return null;
+// }
+
+ public AbstractModel getPluralNounResolverModel() {
+ return (AbstractModel) artifactMap.get(PLURAL_NOUN_RESOLVER_MODEL_ENTRY_NAME);
+ }
+
+ public AbstractModel getSingularPronounResolverModel() {
+ return (AbstractModel) artifactMap.get(SINGULAR_PRONOUN_RESOLVER_MODEL_ENTRY_NAME);
+ }
+
+ public AbstractModel getProperNounResolverModel() {
+ return (AbstractModel) artifactMap.get(PROPER_NOUN_RESOLVER_MODEL_ENTRY_NAME);
+ }
+
+ public AbstractModel getSimModel() {
+ return (AbstractModel) artifactMap.get(SIM_MODEL_ENTRY_NAME);
+ }
+
+ public AbstractModel getPluralPronounResolverModel() {
+ return (AbstractModel) artifactMap.get(PLURAL_PRONOUN_RESOLVER_MODEL_ENTRY_NAME);
+ }
+
+ public static void main(String[] args) throws IOException {
+
+ if (args.length != 1) {
+ System.err.println("Usage: CorefModel projectDirectory");
+ System.exit(-1);
+ }
+
+ String projectDirectory = args[0];
+
+ CorefModel model = new CorefModel("en", projectDirectory);
+ model.serialize(new FileOutputStream("coref.model"));
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/CorefSample.java b/opennlp-coref/src/main/java/opennlp/tools/coref/CorefSample.java
new file mode 100644
index 0000000..05ee08f
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/CorefSample.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.coref.mention.DefaultParse;
+import opennlp.tools.parser.Parse;
+
+public class CorefSample {
+
+ private List<Parse> parses;
+
+ public CorefSample(List<Parse> parses) {
+ this.parses = parses;
+ }
+
+ public List<opennlp.tools.coref.mention.Parse> getParses() {
+
+ List<opennlp.tools.coref.mention.Parse> corefParses =
+ new ArrayList<opennlp.tools.coref.mention.Parse>();
+
+ int sentNumber = 0;
+ for (Parse parse : parses) {
+ corefParses.add(new DefaultParse(parse, sentNumber++));
+ }
+
+ return corefParses;
+ }
+
+ @Override
+ public String toString() {
+
+ StringBuffer sb = new StringBuffer();
+
+ for (Parse parse : parses) {
+ parse.show(sb);
+ sb.append('\n');
+ }
+
+ sb.append('\n');
+
+ return sb.toString();
+ }
+
+ public static CorefSample parse(String corefSampleString) {
+
+ List<Parse> parses = new ArrayList<Parse>();
+
+ for (String line : corefSampleString.split("\\r?\\n")) {
+ parses.add(Parse.parseParse(line));
+ }
+
+ return new CorefSample(parses);
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/CorefSampleDataStream.java b/opennlp-coref/src/main/java/opennlp/tools/coref/CorefSampleDataStream.java
new file mode 100644
index 0000000..404c48f
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/CorefSampleDataStream.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref;
+
+import java.io.IOException;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+public class CorefSampleDataStream extends FilterObjectStream<String, CorefSample> {
+
+ public CorefSampleDataStream(ObjectStream<String> in) {
+ super(in);
+ }
+
+ public CorefSample read() throws IOException {
+
+ String document = samples.read();
+
+ if (document != null) {
+ return CorefSample.parse(document);
+ }
+ else {
+ return null;
+ }
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/CorefTrainer.java b/opennlp-coref/src/main/java/opennlp/tools/coref/CorefTrainer.java
new file mode 100644
index 0000000..9d6ec8c
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/CorefTrainer.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Stack;
+
+import opennlp.tools.coref.mention.DefaultParse;
+import opennlp.tools.coref.mention.Mention;
+import opennlp.tools.coref.mention.MentionContext;
+import opennlp.tools.coref.mention.MentionFinder;
+import opennlp.tools.coref.resolver.MaxentResolver;
+import opennlp.tools.coref.sim.GenderModel;
+import opennlp.tools.coref.sim.NumberModel;
+import opennlp.tools.coref.sim.SimilarityModel;
+import opennlp.tools.coref.sim.TrainSimilarityModel;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.ObjectStream;
+
+public class CorefTrainer {
+
+ private static boolean containsToken(String token, Parse p) {
+ for (Parse node : p.getTagNodes()) {
+ if (node.getCoveredText().equals(token))
+ return true;
+ }
+ return false;
+ }
+
+ private static Mention[] getMentions(CorefSample sample, MentionFinder mentionFinder) {
+
+ List<Mention> mentions = new ArrayList<Mention>();
+
+ for (opennlp.tools.coref.mention.Parse corefParse : sample.getParses()) {
+
+ Parse p = ((DefaultParse) corefParse).getParse();
+
+ Mention extents[] = mentionFinder.getMentions(corefParse);
+
+ for (int ei = 0, en = extents.length; ei < en;ei++) {
+
+ if (extents[ei].getParse() == null) {
+
+ Stack<Parse> nodes = new Stack<Parse>();
+ nodes.add(p);
+
+ while (!nodes.isEmpty()) {
+
+ Parse node = nodes.pop();
+
+ if (node.getSpan().equals(extents[ei].getSpan()) && node.getType().startsWith("NML")) {
+ DefaultParse corefParseNode = new DefaultParse(node, corefParse.getSentenceNumber());
+ extents[ei].setParse(corefParseNode);
+ extents[ei].setId(corefParseNode.getEntityId());
+ break;
+ }
+
+ nodes.addAll(Arrays.asList(node.getChildren()));
+ }
+ }
+ }
+
+ mentions.addAll(Arrays.asList(extents));
+ }
+
+ return mentions.toArray(new Mention[mentions.size()]);
+ }
+
+ public static void train(String modelDirectory, ObjectStream<CorefSample> samples,
+ boolean useTreebank, boolean useDiscourseModel) throws IOException {
+
+ TrainSimilarityModel simTrain = SimilarityModel.trainModel(modelDirectory + "/coref/sim");
+ TrainSimilarityModel genTrain = GenderModel.trainModel(modelDirectory + "/coref/gen");
+ TrainSimilarityModel numTrain = NumberModel.trainModel(modelDirectory + "/coref/num");
+
+ useTreebank = true;
+
+ Linker simLinker;
+
+ if (useTreebank) {
+ simLinker = new TreebankLinker(modelDirectory + "/coref/", LinkerMode.SIM);
+ }
+ else {
+ simLinker = new DefaultLinker(modelDirectory + "/coref/" ,LinkerMode.SIM);
+ }
+
+ // TODO: Feed with training data ...
+ for (CorefSample sample = samples.read(); sample != null; sample = samples.read()) {
+
+ Mention[] mentions = getMentions(sample, simLinker.getMentionFinder());
+ MentionContext[] extentContexts = simLinker.constructMentionContexts(mentions);
+
+ simTrain.setExtents(extentContexts);
+ genTrain.setExtents(extentContexts);
+ numTrain.setExtents(extentContexts);
+ }
+
+ simTrain.trainModel();
+ genTrain.trainModel();
+ numTrain.trainModel();
+
+ MaxentResolver.setSimilarityModel(SimilarityModel.testModel(modelDirectory + "/coref"+"/sim"));
+
+ // Done with similarity training
+
+ // Now train the linkers
+
+ // Training data needs to be read in again and the stream must be reset
+ samples.reset();
+
+ // Now train linkers
+ Linker trainLinker;
+ if (useTreebank) {
+ trainLinker = new TreebankLinker(modelDirectory + "/coref/", LinkerMode.TRAIN, useDiscourseModel);
+ }
+ else {
+ trainLinker = new DefaultLinker(modelDirectory + "/coref/", LinkerMode.TRAIN, useDiscourseModel);
+ }
+
+ for (CorefSample sample = samples.read(); sample != null; sample = samples.read()) {
+
+ Mention[] mentions = getMentions(sample, trainLinker.getMentionFinder());
+ trainLinker.setEntities(mentions);
+ }
+
+ trainLinker.train();
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/DefaultLinker.java b/opennlp-coref/src/main/java/opennlp/tools/coref/DefaultLinker.java
new file mode 100644
index 0000000..74ebbfc
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/DefaultLinker.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref;
+
+import java.io.IOException;
+
+import opennlp.tools.coref.mention.MentionContext;
+import opennlp.tools.coref.mention.PTBHeadFinder;
+import opennlp.tools.coref.mention.ShallowParseMentionFinder;
+import opennlp.tools.coref.resolver.AbstractResolver;
+import opennlp.tools.coref.resolver.CommonNounResolver;
+import opennlp.tools.coref.resolver.DefiniteNounResolver;
+import opennlp.tools.coref.resolver.FixedNonReferentialResolver;
+import opennlp.tools.coref.resolver.IsAResolver;
+import opennlp.tools.coref.resolver.MaxentResolver;
+import opennlp.tools.coref.resolver.NonReferentialResolver;
+import opennlp.tools.coref.resolver.PerfectResolver;
+import opennlp.tools.coref.resolver.PluralNounResolver;
+import opennlp.tools.coref.resolver.PluralPronounResolver;
+import opennlp.tools.coref.resolver.ProperNounResolver;
+import opennlp.tools.coref.resolver.ResolverMode;
+import opennlp.tools.coref.resolver.SingularPronounResolver;
+import opennlp.tools.coref.resolver.SpeechPronounResolver;
+import opennlp.tools.coref.sim.Gender;
+import opennlp.tools.coref.sim.MaxentCompatibilityModel;
+import opennlp.tools.coref.sim.Number;
+import opennlp.tools.coref.sim.SimilarityModel;
+
+/**
+ * This class performs coreference for treebank style parses or for noun-phrase chunked data.
+ * Non-constituent entities such as pre-nominal named-entities and sub entities in simple coordinated
+ * noun phrases will be created. This linker requires that named-entity information also be provided.
+ * This information can be added to the parse using the -parse option with EnglishNameFinder.
+ */
+public class DefaultLinker extends AbstractLinker {
+
+ protected MaxentCompatibilityModel mcm;
+
+ /**
+ * Creates a new linker with the specified model directory, running in the specified mode.
+ * @param modelDirectory The directory where the models for this linker are kept.
+ * @param mode The mode that this linker is running in.
+ * @throws IOException when the models can not be read or written to based on the mode.
+ */
+ public DefaultLinker(String modelDirectory, LinkerMode mode) throws IOException {
+ this(modelDirectory,mode,true,-1);
+ }
+
+ /**
+ * Creates a new linker with the specified model directory, running in the specified mode which uses a discourse model
+ * based on the specified parameter.
+ * @param modelDirectory The directory where the models for this linker are kept.
+ * @param mode The mode that this linker is running in.
+ * @param useDiscourseModel Whether the model should use a discourse model or not.
+ * @throws IOException when the models can not be read or written to based on the mode.
+ */
+ public DefaultLinker(String modelDirectory, LinkerMode mode, boolean useDiscourseModel) throws IOException {
+ this(modelDirectory,mode,useDiscourseModel,-1);
+ }
+
+ /**
+ * Creates a new linker with the specified model directory, running in the specified mode which uses a discourse model
+ * based on the specified parameter and uses the specified fixed non-referential probability.
+ * @param modelDirectory The directory where the models for this linker are kept.
+ * @param mode The mode that this linker is running in.
+ * @param useDiscourseModel Whether the model should use a discourse model or not.
+ * @param fixedNonReferentialProbability The probability which resolvers are required to exceed to posit a coreference relationship.
+ * @throws IOException when the models can not be read or written to based on the mode.
+ */
+ public DefaultLinker(String modelDirectory, LinkerMode mode, boolean useDiscourseModel, double fixedNonReferentialProbability) throws IOException {
+ super(modelDirectory, mode, useDiscourseModel);
+ if (mode != LinkerMode.SIM) {
+ mcm = new MaxentCompatibilityModel(corefProject);
+ }
+ initHeadFinder();
+ initMentionFinder();
+ if (mode != LinkerMode.SIM) {
+ initResolvers(mode, fixedNonReferentialProbability);
+ entities = new DiscourseEntity[resolvers.length];
+ }
+ }
+
+ /**
+ * Initializes the resolvers used by this linker.
+ * @param mode The mode in which this linker is being used.
+ * @param fixedNonReferentialProbability
+ * @throws IOException
+ */
+ protected void initResolvers(LinkerMode mode, double fixedNonReferentialProbability) throws IOException {
+ if (mode == LinkerMode.TRAIN) {
+ mentionFinder.setPrenominalNamedEntityCollection(false);
+ mentionFinder.setCoordinatedNounPhraseCollection(false);
+ }
+ SINGULAR_PRONOUN = 0;
+ if (LinkerMode.TEST == mode || LinkerMode.EVAL == mode) {
+ if (fixedNonReferentialProbability < 0) {
+ resolvers = new MaxentResolver[] {
+ new SingularPronounResolver(corefProject, ResolverMode.TEST),
+ new ProperNounResolver(corefProject, ResolverMode.TEST),
+ new DefiniteNounResolver(corefProject, ResolverMode.TEST),
+ new IsAResolver(corefProject, ResolverMode.TEST),
+ new PluralPronounResolver(corefProject, ResolverMode.TEST),
+ new PluralNounResolver(corefProject, ResolverMode.TEST),
+ new CommonNounResolver(corefProject, ResolverMode.TEST),
+ new SpeechPronounResolver(corefProject, ResolverMode.TEST)
+ };
+ }
+ else {
+ NonReferentialResolver nrr = new FixedNonReferentialResolver(fixedNonReferentialProbability);
+ resolvers = new MaxentResolver[] {
+ new SingularPronounResolver(corefProject, ResolverMode.TEST,nrr),
+ new ProperNounResolver(corefProject, ResolverMode.TEST,nrr),
+ new DefiniteNounResolver(corefProject, ResolverMode.TEST,nrr),
+ new IsAResolver(corefProject, ResolverMode.TEST,nrr),
+ new PluralPronounResolver(corefProject, ResolverMode.TEST,nrr),
+ new PluralNounResolver(corefProject, ResolverMode.TEST,nrr),
+ new CommonNounResolver(corefProject, ResolverMode.TEST,nrr),
+ new SpeechPronounResolver(corefProject, ResolverMode.TEST,nrr)
+ };
+ }
+ if (LinkerMode.EVAL == mode) {
+ //String[] names = {"Pronoun", "Proper", "Def-NP", "Is-a", "Plural Pronoun"};
+ //eval = new Evaluation(names);
+ }
+ MaxentResolver.setSimilarityModel(SimilarityModel.testModel(corefProject + "/sim"));
+ }
+ else if (LinkerMode.TRAIN == mode) {
+ resolvers = new AbstractResolver[9];
+ resolvers[0] = new SingularPronounResolver(corefProject, ResolverMode.TRAIN);
+ resolvers[1] = new ProperNounResolver(corefProject, ResolverMode.TRAIN);
+ resolvers[2] = new DefiniteNounResolver(corefProject, ResolverMode.TRAIN);
+ resolvers[3] = new IsAResolver(corefProject, ResolverMode.TRAIN);
+ resolvers[4] = new PluralPronounResolver(corefProject, ResolverMode.TRAIN);
+ resolvers[5] = new PluralNounResolver(corefProject, ResolverMode.TRAIN);
+ resolvers[6] = new CommonNounResolver(corefProject, ResolverMode.TRAIN);
+ resolvers[7] = new SpeechPronounResolver(corefProject, ResolverMode.TRAIN);
+ resolvers[8] = new PerfectResolver();
+ }
+ else {
+ System.err.println("DefaultLinker: Invalid Mode");
+ }
+ }
+
+ /**
+ * Initializes the head finder for this linker.
+ */
+ protected void initHeadFinder() {
+ headFinder = PTBHeadFinder.getInstance();
+ }
+ /**
+ * Initializes the mention finder for this linker.
+ * This can be overridden to change the space of mentions used for coreference.
+ */
+ protected void initMentionFinder() {
+ mentionFinder = ShallowParseMentionFinder.getInstance(headFinder);
+ }
+
+ @Override
+ protected Gender computeGender(MentionContext mention) {
+ return mcm.computeGender(mention);
+ }
+
+ @Override
+ protected Number computeNumber(MentionContext mention) {
+ return mcm.computeNumber(mention);
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/DiscourseElement.java b/opennlp-coref/src/main/java/opennlp/tools/coref/DiscourseElement.java
new file mode 100644
index 0000000..9336fad
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/DiscourseElement.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.coref.mention.MentionContext;
+import opennlp.tools.util.ReverseListIterator;
+
+/**
+ * Represents an item in which can be put into the discourse model. Object which are
+ * to be placed in the discourse model should extend this class.
+ *
+ * @see opennlp.tools.coref.DiscourseModel
+ */
+public abstract class DiscourseElement {
+
+ private List<MentionContext> extents;
+ private int id=-1;
+ private MentionContext lastExtent;
+
+ /**
+ * Creates a new discourse element which contains the specified mention.
+ *
+ * @param mention The mention which begins this discourse element.
+ */
+ public DiscourseElement(MentionContext mention) {
+ extents = new ArrayList<MentionContext>(1);
+ lastExtent = mention;
+ extents.add(mention);
+ }
+
+ /**
+ * Returns an iterator over the mentions which iterates through them based on which were most recently mentioned.
+ * @return the {@link Iterator}.
+ */
+ public Iterator<MentionContext> getRecentMentions() {
+ return(new ReverseListIterator<MentionContext>(extents));
+ }
+
+ /**
+ * Returns an iterator over the mentions which iterates through them based on
+ * their occurrence in the document.
+ *
+ * @return the {@link Iterator}
+ */
+ public Iterator<MentionContext> getMentions() {
+ return(extents.listIterator());
+ }
+
+ /**
+ * Returns the number of mentions in this element.
+ *
+ * @return number of mentions
+ */
+ public int getNumMentions() {
+ return(extents.size());
+ }
+
+ /**
+ * Adds the specified mention to this discourse element.
+ * @param mention The mention to be added.
+ */
+ public void addMention(MentionContext mention) {
+ extents.add(mention);
+ lastExtent=mention;
+ }
+
+ /**
+ * Returns the last mention for this element. For appositives this will be the
+ * first part of the appositive.
+ * @return the last mention for this element.
+ */
+ public MentionContext getLastExtent() {
+ return(lastExtent);
+ }
+
+ /**
+ * Associates an id with this element.
+ * @param id The id.
+ */
+ public void setId(int id) {
+ this.id=id;
+ }
+
+ /**
+ * Returns the id associated with this element.
+ *
+ * @return the id associated with this element.
+ */
+ public int getId() {
+ return(id);
+ }
+
+ @Override
+ public String toString() {
+ Iterator<MentionContext> ei = extents.iterator();
+ MentionContext ex = ei.next();
+ StringBuilder de = new StringBuilder();
+ de.append("[ ").append(ex.toText());//.append("<").append(ex.getHeadText()).append(">");
+ while (ei.hasNext()) {
+ ex = ei.next();
+ de.append(", ").append(ex.toText());//.append("<").append(ex.getHeadText()).append(">");
+ }
+ de.append(" ]");
+ return(de.toString());
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/DiscourseEntity.java b/opennlp-coref/src/main/java/opennlp/tools/coref/DiscourseEntity.java
new file mode 100644
index 0000000..f92a883
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/DiscourseEntity.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref;
+
+import opennlp.tools.coref.mention.MentionContext;
+import opennlp.tools.coref.sim.GenderEnum;
+import opennlp.tools.coref.sim.NumberEnum;
+
+/**
+ * Represents an entity in a discourse model.
+ */
+public class DiscourseEntity extends DiscourseElement {
+
+ private String category = null;
+ private GenderEnum gender;
+ private double genderProb;
+ private NumberEnum number;
+ private double numberProb;
+
+ /**
+ * Creates a new entity based on the specified mention and its specified gender and number properties.
+ *
+ * @param mention The first mention of this entity.
+ * @param gender The gender of this entity.
+ * @param genderProb The probability that the specified gender is correct.
+ * @param number The number for this entity.
+ * @param numberProb The probability that the specified number is correct.
+ */
+ public DiscourseEntity(MentionContext mention, GenderEnum gender, double genderProb, NumberEnum number, double numberProb) {
+ super(mention);
+ this.gender = gender;
+ this.genderProb = genderProb;
+ this.number = number;
+ this.numberProb = numberProb;
+ }
+
+ /**
+ * Creates a new entity based on the specified mention.
+ *
+ * @param mention The first mention of this entity.
+ */
+ public DiscourseEntity(MentionContext mention) {
+ super(mention);
+ gender = GenderEnum.UNKNOWN;
+ number = NumberEnum.UNKNOWN;
+ }
+
+ /**
+ * Returns the semantic category of this entity.
+ * This field is used to associated named-entity categories with an entity.
+ *
+ * @return the semantic category of this entity.
+ */
+ public String getCategory() {
+ return (category);
+ }
+
+ /**
+ * Specifies the semantic category of this entity.
+ *
+ * @param cat The semantic category of the entity.
+ */
+ public void setCategory(String cat) {
+ category = cat;
+ }
+
+ /**
+ * Returns the gender associated with this entity.
+ *
+ * @return the gender associated with this entity.
+ */
+ public GenderEnum getGender() {
+ return gender;
+ }
+
+ /**
+ * Returns the probability for the gender associated with this entity.
+ *
+ * @return the probability for the gender associated with this entity.
+ */
+ public double getGenderProbability() {
+ return genderProb;
+ }
+
+ /**
+ * Returns the number associated with this entity.
+ *
+ * @return the number associated with this entity.
+ */
+ public NumberEnum getNumber() {
+ return number;
+ }
+
+ /**
+ * Returns the probability for the number associated with this entity.
+ *
+ * @return the probability for the number associated with this entity.
+ */
+ public double getNumberProbability() {
+ return numberProb;
+ }
+
+ /**
+ * Specifies the gender of this entity.
+ *
+ * @param gender The gender.
+ */
+ public void setGender(GenderEnum gender) {
+ this.gender = gender;
+ }
+
+ /**
+ * Specifies the probability of the gender of this entity.
+ *
+ * @param p the probability of the gender of this entity.
+ */
+ public void setGenderProbability(double p) {
+ genderProb = p;
+ }
+
+ /**
+ * Specifies the number of this entity.
+ *
+ * @param number
+ */
+ public void setNumber(NumberEnum number) {
+ this.number = number;
+ }
+
+ /**
+ * Specifies the probability of the number of this entity.
+ *
+ * @param p the probability of the number of this entity.
+ */
+ public void setNumberProbability(double p) {
+ numberProb = p;
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/DiscourseModel.java b/opennlp-coref/src/main/java/opennlp/tools/coref/DiscourseModel.java
new file mode 100644
index 0000000..f0552a7
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/DiscourseModel.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.coref.mention.MentionContext;
+
+/**
+ * Represents the elements which are part of a discourse.
+ */
+public class DiscourseModel {
+
+ private List<DiscourseEntity> entities;
+
+ int nextEntityId = 1;
+
+ /**
+ * Creates a new discourse model.
+ */
+ public DiscourseModel() {
+ entities = new ArrayList<DiscourseEntity>();
+ }
+
+ /**
+ * Indicates that the specified entity has been mentioned.
+ *
+ * @param e The entity which has been mentioned.
+ */
+ public void mentionEntity(DiscourseEntity e) {
+ if (entities.remove(e)) {
+ entities.add(0,e);
+ }
+ else {
+ System.err.println("DiscourseModel.mentionEntity: failed to remove "+e);
+ }
+ }
+
+ /**
+ * Returns the number of entities in this discourse model.
+ *
+ * @return the number of entities in this discourse model.
+ */
+ public int getNumEntities() {
+ return entities.size();
+ }
+
+ /**
+ * Returns the entity at the specified index.
+ *
+ * @param i The index of the entity to be returned.
+ * @return the entity at the specified index.
+ */
+ public DiscourseEntity getEntity(int i) {
+ return entities.get(i);
+ }
+
+ /**
+ * Adds the specified entity to this discourse model.
+ *
+ * @param e the entity to be added to the model.
+ */
+ public void addEntity(DiscourseEntity e) {
+ e.setId(nextEntityId);
+ nextEntityId++;
+ entities.add(0,e);
+ }
+
+ /**
+ * Merges the specified entities into a single entity with the specified confidence.
+ *
+ * @param e1 The first entity.
+ * @param e2 The second entity.
+ * @param confidence The confidence.
+ */
+ public void mergeEntities(DiscourseEntity e1,DiscourseEntity e2,float confidence) {
+ for (Iterator<MentionContext> ei=e2.getMentions();ei.hasNext();) {
+ e1.addMention(ei.next());
+ }
+ //System.err.println("DiscourseModel.mergeEntities: removing "+e2);
+ entities.remove(e2);
+ }
+
+ /**
+ * Returns the entities in the discourse model.
+ *
+ * @return the entities in the discourse model.
+ */
+ public DiscourseEntity[] getEntities() {
+ DiscourseEntity[] des = new DiscourseEntity[entities.size()];
+ entities.toArray(des);
+ return des;
+ }
+
+ /**
+ * Removes all elements from this discourse model.
+ */
+ public void clear() {
+ entities.clear();
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/Linker.java b/opennlp-coref/src/main/java/opennlp/tools/coref/Linker.java
new file mode 100644
index 0000000..8e0c249
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/Linker.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref;
+
+import java.io.IOException;
+
+import opennlp.tools.coref.mention.HeadFinder;
+import opennlp.tools.coref.mention.Mention;
+import opennlp.tools.coref.mention.MentionContext;
+import opennlp.tools.coref.mention.MentionFinder;
+
+/**
+ * A linker provides an interface for finding mentions, {@link #getMentionFinder getMentionFinder},
+ * and creating entities out of those mentions, {@link #getEntities getEntities}. This interface also allows
+ * for the training of a resolver with the method {@link #setEntities setEntitites} which is used to give the
+ * resolver mentions whose entityId fields indicate which mentions refer to the same entity and the
+ * {@link #train train} method which compiles all the information provided via calls to
+ * {@link #setEntities setEntities} into a model.
+ */
+public interface Linker {
+
+
+ /**
+ * String constant used to label a mention which is a description.
+ */
+ public static final String DESCRIPTOR = "desc";
+
+ /**
+ * String constant used to label an mention in an appositive relationship.
+ */
+ public static final String ISA = "isa";
+
+ /**
+ * String constant used to label a mention which consists of two or more noun phrases.
+ */
+ public static final String COMBINED_NPS = "cmbnd";
+
+ /**
+ * String constant used to label a mention which consists of a single noun phrase.
+ */
+ public static final String NP = "np";
+
+ /**
+ * String constant used to label a mention which is a proper noun modifying another noun.
+ */
+ public static final String PROPER_NOUN_MODIFIER = "pnmod";
+
+ /**
+ * String constant used to label a mention which is a pronoun.
+ */
+ public static final String PRONOUN_MODIFIER = "np";
+
+
+ /**
+ * Indicated that the specified mentions can be used to train this linker.
+ * This requires that the coreference relationship between the mentions have been labeled
+ * in the mention's id field.
+ *
+ * @param mentions The mentions to be used to train the linker.
+ */
+ public void setEntities(Mention[] mentions);
+
+ /** Returns a list of entities which group the mentions into entity classes.
+ * @param mentions A array of mentions.
+ *
+ * @return An array of discourse entities.
+ */
+ public DiscourseEntity[] getEntities(Mention[] mentions);
+
+ /**
+ * Creates mention contexts for the specified mention exents. These are used to compute coreference features over.
+ * @param mentions The mention of a document.
+ *
+ * @return mention contexts for the specified mention exents.
+ */
+ public MentionContext[] constructMentionContexts(Mention[] mentions);
+
+ /**
+ * Trains the linker based on the data specified via calls to {@link #setEntities setEntities}.
+ *
+ * @throws IOException
+ */
+ public void train() throws IOException;
+
+ /**
+ * Returns the mention finder for this linker. This can be used to get the mentions of a Parse.
+ *
+ * @return The object which finds mentions for this linker.
+ */
+ public MentionFinder getMentionFinder();
+
+ /**
+ * Returns the head finder associated with this linker.
+ *
+ * @return The head finder associated with this linker.
+ */
+ public HeadFinder getHeadFinder();
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/LinkerMode.java b/opennlp-coref/src/main/java/opennlp/tools/coref/LinkerMode.java
new file mode 100644
index 0000000..654db5a
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/LinkerMode.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.coref;
+
+/**
+ * Enumeration of modes in which a linker can run.
+ */
public enum LinkerMode {

  /**
   * Testing mode, used to identify coreference relationships in un-annotated text.
   */
  TEST,

  /**
   * Training mode, used to learn coreference relationships in annotated text.
   */
  TRAIN,

  /**
   * Evaluation mode, used to evaluate identified coreference relationships
   * based on annotated text.
   */
  EVAL,

  /**
   * Similarity mode — presumably used when training the similarity (gender/number)
   * models from annotated text; the original comment here was a copy-paste
   * duplicate of the TRAIN description. TODO confirm against callers.
   */
  SIM
}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/TreebankLinker.java b/opennlp-coref/src/main/java/opennlp/tools/coref/TreebankLinker.java
new file mode 100644
index 0000000..db265e7
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/TreebankLinker.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref;
+
+import java.io.IOException;
+
+import opennlp.tools.coref.mention.PTBMentionFinder;
+
+/**
+ * This class perform coreference for treebank style parses.
+ * <p>
+ * It will only perform coreference over constituents defined in the trees and
+ * will not generate new constituents for pre-nominal entities or sub-entities in
+ * simple coordinated noun phrases.
+ * <p>
+ * This linker requires that named-entity information also be provided.
+ */
public class TreebankLinker extends DefaultLinker {

  /**
   * Creates a treebank linker for the specified model directory and mode.
   *
   * @param project The location of the coref model directory (see {@code DefaultLinker}).
   * @param mode The mode (train/test/eval) this linker runs in.
   * @throws IOException if the models cannot be read.
   */
  public TreebankLinker(String project, LinkerMode mode) throws IOException {
    super(project,mode);
  }

  /**
   * Creates a treebank linker, optionally using the discourse model.
   *
   * @param project The location of the coref model directory.
   * @param mode The mode this linker runs in.
   * @param useDiscourseModel Whether the discourse model should be used.
   * @throws IOException if the models cannot be read.
   */
  public TreebankLinker(String project, LinkerMode mode, boolean useDiscourseModel) throws IOException {
    super(project,mode,useDiscourseModel);
  }

  /**
   * Creates a treebank linker with a fixed non-referential probability.
   *
   * @param project The location of the coref model directory.
   * @param mode The mode this linker runs in.
   * @param useDiscourseModel Whether the discourse model should be used.
   * @param fixedNonReferentialProbability The fixed non-referential probability
   *        passed through to {@code DefaultLinker}.
   * @throws IOException if the models cannot be read.
   */
  public TreebankLinker(String project, LinkerMode mode, boolean useDiscourseModel, double fixedNonReferentialProbability) throws IOException {
    super(project,mode,useDiscourseModel,fixedNonReferentialProbability);
  }

  // Uses treebank-style mentions instead of the shallow-parse mentions of DefaultLinker.
  @Override
  protected void initMentionFinder() {
    mentionFinder = PTBMentionFinder.getInstance(headFinder);
  }
}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/AbstractMentionFinder.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/AbstractMentionFinder.java
new file mode 100644
index 0000000..4bf28a2
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/AbstractMentionFinder.java
@@ -0,0 +1,416 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.coref.mention;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.tools.coref.Linker;
+import opennlp.tools.coref.resolver.ResolverUtils;
+import opennlp.tools.util.Span;
+
+/**
+ * Provides default implementation of many of the methods in the {@link MentionFinder} interface.
+ */
public abstract class AbstractMentionFinder implements MentionFinder {

  /** Head finder used to determine the head child/token of noun phrases. */
  protected HeadFinder headFinder;

  /** Whether mentions are also created for named entities which pre-modify the head noun. */
  protected boolean collectPrenominalNamedEntities;
  /** Whether mentions are also created for the conjuncts of coordinated noun phrases. */
  protected boolean collectCoordinatedNounPhrases;

  /**
   * Records the head relation for {@code p}: maps the head child of {@code p}
   * (as determined by the head finder) to {@code p} itself.
   */
  private void gatherHeads(Parse p, Map<Parse, Parse> heads) {
    Parse head = headFinder.getHead(p);
    //System.err.println("AbstractMention.gatherHeads: "+head+" -> ("+p.hashCode()+") "+p);
    //if (head != null) { System.err.println("head.hashCode()="+head.hashCode());}
    if (head != null) {
      heads.put(head, p);
    }
  }

  /** Assigns head relations between noun phrases and the child np
   * which is their head.
   * @param nps List of valid nps for this mention finder.
   * @return mapping from noun phrases and the child np which is their head
   **/
  protected Map<Parse, Parse> constructHeadMap(List<Parse> nps) {
    Map<Parse, Parse> headMap = new HashMap<Parse, Parse>();
    for (int ni = 0; ni < nps.size(); ni++) {
      Parse np = nps.get(ni);
      gatherHeads(np, headMap);
    }
    return headMap;
  }

  /** Returns whether prenominal named-entity mentions are collected. */
  public boolean isPrenominalNamedEntityCollection() {
    return collectPrenominalNamedEntities;
  }

  /** Specifies whether prenominal named-entity mentions should be collected. */
  public void setPrenominalNamedEntityCollection(boolean b) {
    collectPrenominalNamedEntities = b;
  }

  /** Returns true if the specified noun phrase contains no other noun phrases. */
  protected boolean isBasalNounPhrase(Parse np) {
    return np.getNounPhrases().size() == 0;
  }

  /**
   * Returns true if the noun phrase looks possessive: either its first child is
   * an NP whose last token is tagged POS ('s), or its first three children form
   * the pattern NP POS NP.
   */
  protected boolean isPossessive(Parse np) {
    List<Parse> parts = np.getSyntacticChildren();
    if (parts.size() > 1) {
      Parse child0 = parts.get(0);
      if (child0.isNounPhrase()) {
        List<Parse> ctoks = child0.getTokens();
        Parse tok = ctoks.get(ctoks.size() - 1);
        if (tok.getSyntacticType().equals("POS")) {
          return true;
        }
      }
    }
    if (parts.size() > 2) {
      Parse child0 = parts.get(0);
      Parse child1 = parts.get(1);
      Parse child2 = parts.get(2);
      if (child1.isToken() && child1.getSyntacticType().equals("POS") && child0.isNounPhrase() && child2.isNounPhrase()) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns true if the noun phrase has exactly two children, the first an NP
   * and the second a phrase beginning with the token "of".
   */
  protected boolean isOfPrepPhrase(Parse np) {
    List<Parse> parts = np.getSyntacticChildren();
    if (parts.size() == 2) {
      Parse child0 = parts.get(0);
      if (child0.isNounPhrase()) {
        Parse child1 = parts.get(1);
        List<Parse> cparts = child1.getSyntacticChildren();
        if (cparts.size() == 2) {
          Parse child2 = cparts.get(0);
          if (child2.isToken() && child2.toString().equals("of")) {
            return true;
          }
        }
      }
    }
    return false;
  }

  /**
   * Returns true if the noun phrase consists only of tokens and contains a
   * coordinating conjunction (CC), e.g. "cats and dogs".
   */
  protected boolean isConjoinedBasal(Parse np) {
    List<Parse> parts = np.getSyntacticChildren();
    boolean allToken = true;
    boolean hasConjunction = false;
    for (int ti = 0; ti < parts.size(); ti++) {
      Parse c = parts.get(ti);
      if (c.isToken()) {
        if (c.getSyntacticType().equals("CC")) {
          hasConjunction = true;
        }
      }
      else {
        allToken = false;
        break;
      }
    }
    return allToken && hasConjunction;
  }

  /**
   * Adds a mention for each conjunct of a coordinated noun phrase, scanning the
   * tokens right-to-left from the head and splitting on "and"/"or" (and, once a
   * coordination has been seen, on commas). Noun phrases containing UCP or NX
   * sub-constituents are skipped entirely.
   */
  private void collectCoordinatedNounPhraseMentions(Parse np, List<Mention> entities) {
    //System.err.println("collectCoordNp: "+np);
    //exclude nps with UCPs inside.
    List<Parse> sc = np.getSyntacticChildren();
    for (Iterator<Parse> sci = sc.iterator();sci.hasNext();) {
      Parse scp = sci.next();
      if (scp.getSyntacticType().equals("UCP") || scp.getSyntacticType().equals("NX")) {
        return;
      }
    }
    List<Parse> npTokens = np.getTokens();
    boolean inCoordinatedNounPhrase = false;
    // lastNpTokenIndex marks the right edge (inclusive) of the current conjunct.
    int lastNpTokenIndex = headFinder.getHeadIndex(np);
    for (int ti = lastNpTokenIndex - 1; ti >= 0; ti--) {
      Parse tok = npTokens.get(ti);
      String tokStr = tok.toString();
      if ((tokStr.equals("and") || tokStr.equals("or")) && !isPartOfName(tok)) {
        if (lastNpTokenIndex != ti) {
          // Only treat as coordination when the conjunct is preceded by a noun (NN*).
          if (ti - 1 >= 0 && (npTokens.get(ti - 1)).getSyntacticType().startsWith("NN")) {
            Span npSpan = new Span((npTokens.get(ti + 1)).getSpan().getStart(), (npTokens.get(lastNpTokenIndex)).getSpan().getEnd());
            Mention snpExtent = new Mention(npSpan, npSpan, tok.getEntityId(), null,"CNP");
            entities.add(snpExtent);
            //System.err.println("adding extent for conjunction in: "+np+" preeceeded by "+((Parse) npTokens.get(ti-1)).getSyntacticType());
            inCoordinatedNounPhrase = true;
          }
          else {
            break;
          }
        }
        lastNpTokenIndex = ti - 1;
      }
      else if (inCoordinatedNounPhrase && tokStr.equals(",")) {
        if (lastNpTokenIndex != ti) {
          Span npSpan = new Span((npTokens.get(ti + 1)).getSpan().getStart(), (npTokens.get(lastNpTokenIndex)).getSpan().getEnd());
          Mention snpExtent = new Mention(npSpan, npSpan, tok.getEntityId(), null,"CNP");
          entities.add(snpExtent);
          //System.err.println("adding extent for comma in: "+np);
        }
        lastNpTokenIndex = ti - 1;
      }
      else if (inCoordinatedNounPhrase && ti == 0 && lastNpTokenIndex >= 0) {
        // First token reached: emit the left-most conjunct.
        Span npSpan = new Span((npTokens.get(ti)).getSpan().getStart(), (npTokens.get(lastNpTokenIndex)).getSpan().getEnd());
        Mention snpExtent = new Mention(npSpan, npSpan, tok.getEntityId(), null,"CNP");
        entities.add(snpExtent);
        //System.err.println("adding extent for start coord in: "+np);
      }
    }
  }

  /** Returns true if the token matches one of the pronoun patterns this finder handles. */
  private boolean handledPronoun(String tok) {
    return ResolverUtils.singularThirdPersonPronounPattern.matcher(tok).find() ||
           ResolverUtils.pluralThirdPersonPronounPattern.matcher(tok).find() ||
           ResolverUtils.speechPronounPattern.matcher(tok).find();
  }

  /**
   * Adds a mention for the right-most possessive pronoun (PRP*) token inside the
   * noun phrase, excluding the head token itself.
   */
  private void collectPossesivePronouns(Parse np, List<Mention> entities) {
    //TODO: Look at how training is done and examine whether this is needed or can be accomidated in a different way.
    /*
    List snps = np.getSubNounPhrases();
    if (snps.size() != 0) {
      //System.err.println("AbstractMentionFinder: Found existing snps");
      for (int si = 0, sl = snps.size(); si < sl; si++) {
        Parse snp = (Parse) snps.get(si);
        Extent ppExtent = new Extent(snp.getSpan(), snp.getSpan(), snp.getEntityId(), null,Linker.PRONOUN_MODIFIER);
        entities.add(ppExtent);
      }
    }
    else {
    */
    //System.err.println("AbstractEntityFinder.collectPossesivePronouns: "+np);
    List<Parse> npTokens = np.getTokens();
    Parse headToken = headFinder.getHeadToken(np);
    for (int ti = npTokens.size() - 2; ti >= 0; ti--) {
      Parse tok = npTokens.get(ti);
      if (tok == headToken) {
        continue;
      }
      if (tok.getSyntacticType().startsWith("PRP") && handledPronoun(tok.toString())) {
        Mention ppExtent = new Mention(tok.getSpan(), tok.getSpan(), tok.getEntityId(), null,Linker.PRONOUN_MODIFIER);
        //System.err.println("AbstractEntityFinder.collectPossesivePronouns: adding possesive pronoun: "+tok+" "+tok.getEntityId());
        entities.add(ppExtent);
        //System.err.println("AbstractMentionFinder: adding pos-pro: "+ppExtent);
        break;
      }
    }
    //}
  }

  /**
   * Removes adjacent mentions with identical spans. Assumes the list is sorted
   * so that duplicates are adjacent (see {@link #collectMentions}).
   */
  private void removeDuplicates(List<Mention> extents) {
    Mention lastExtent = null;
    for (Iterator<Mention> ei = extents.iterator(); ei.hasNext();) {
      Mention e = ei.next();
      if (lastExtent != null && e.getSpan().equals(lastExtent.getSpan())) {
        ei.remove();
      }
      else {
        lastExtent = e;
      }
    }
  }

  /**
   * Returns true if the noun phrase, or any ancestor reachable through the head
   * map, is already recorded as a mention. Assumes the head-map chains are
   * acyclic; a cyclic map would loop here.
   */
  private boolean isHeadOfExistingMention(Parse np, Map<Parse, Parse> headMap,
      Set<Parse> mentions) {
    Parse head = headMap.get(np);
    while(head != null){
      if (mentions.contains(head)) {
        return true;
      }
      head = headMap.get(head);
    }
    return false;
  }

  /** Removes from the set all mentions whose spans do not contain the span of np. */
  private void clearMentions(Set<Parse> mentions, Parse np) {
    Span npSpan =np.getSpan();
    for(Iterator<Parse> mi=mentions.iterator();mi.hasNext();) {
      Parse mention = mi.next();
      if (!mention.getSpan().contains(npSpan)) {
        //System.err.println("clearing "+mention+" for "+np);
        mi.remove();
      }
    }
  }

  /**
   * Builds the mention array for the given noun phrases: one mention per NP that
   * is not the head of an earlier mention and not part of a name, plus (when
   * enabled) prenominal named entities, coordinated conjuncts, and possessive
   * pronouns inside basal NPs. Result is sorted and de-duplicated.
   */
  private Mention[] collectMentions(List<Parse> nps, Map<Parse, Parse> headMap) {
    List<Mention> mentions = new ArrayList<Mention>(nps.size());
    Set<Parse> recentMentions = new HashSet<Parse>();
    //System.err.println("AbtractMentionFinder.collectMentions: "+headMap);
    for (int npi = 0, npl = nps.size(); npi < npl; npi++) {
      Parse np = nps.get(npi);
      //System.err.println("AbstractMentionFinder: collectMentions: np[" + npi + "]=" + np + " head=" + headMap.get(np));
      if (!isHeadOfExistingMention(np,headMap, recentMentions)) {
        clearMentions(recentMentions, np);
        if (!isPartOfName(np)) {
          Parse head = headFinder.getLastHead(np);
          Mention extent = new Mention(np.getSpan(), head.getSpan(), head.getEntityId(), np, null);
          //System.err.println("adding "+np+" with head "+head);
          mentions.add(extent);
          recentMentions.add(np);
          // determine name-entity type
          String entityType = getEntityType(headFinder.getHeadToken(head));
          if (entityType != null) {
            extent.setNameType(entityType);
          }
        }
        else {
          //System.err.println("AbstractMentionFinder.collectMentions excluding np as part of name. np=" + np);
        }
      }
      else {
        //System.err.println("AbstractMentionFinder.collectMentions excluding np as head of previous mention. np=" + np);
      }
      if (isBasalNounPhrase(np)) {
        if (collectPrenominalNamedEntities) {
          collectPrenominalNamedEntities(np, mentions);
        }
        if (collectCoordinatedNounPhrases) {
          collectCoordinatedNounPhraseMentions(np, mentions);
        }
        collectPossesivePronouns(np, mentions);
      }
      else {
        // Could use to get NP -> tokens CON structures for basal nps including NP -> NAC tokens
        //collectComplexNounPhrases(np,mentions);
      }
    }
    Collections.sort(mentions);
    removeDuplicates(mentions);
    return mentions.toArray(new Mention[mentions.size()]);
  }

  /**
   * Adds a mention for the non-treebank-labeled possesive noun phrases.
   * @param possesiveNounPhrase The possesive noun phase which may require an additional mention.
   * @param mentions The list of mentions into which a new mention can be added.
   */
//  private void addPossesiveMentions(Parse possesiveNounPhrase, List<Mention> mentions) {
//    List<Parse> kids = possesiveNounPhrase.getSyntacticChildren();
//    if (kids.size() >1) {
//      Parse firstToken = kids.get(1);
//      if (firstToken.isToken() && !firstToken.getSyntacticType().equals("POS")) {
//        Parse lastToken = kids.get(kids.size()-1);
//        if (lastToken.isToken()) {
//          Span extentSpan = new Span(firstToken.getSpan().getStart(),lastToken.getSpan().getEnd());
//          Mention extent = new Mention(extentSpan, extentSpan, -1, null, null);
//          mentions.add(extent);
//        }
//        else {
//          System.err.println("AbstractMentionFinder.addPossesiveMentions: odd parse structure: "+possesiveNounPhrase);
//        }
//      }
//    }
//  }

  /**
   * Adds a "NAME"-labeled mention for each named entity inside the noun phrase
   * whose span does not contain the head token (i.e. prenominal modifiers).
   */
  private void collectPrenominalNamedEntities(Parse np, List<Mention> extents) {
    Parse htoken = headFinder.getHeadToken(np);
    List<Parse> nes = np.getNamedEntities();
    Span headTokenSpan = htoken.getSpan();
    for (int nei = 0, nel = nes.size(); nei < nel; nei++) {
      Parse ne = nes.get(nei);
      if (!ne.getSpan().contains(headTokenSpan)) {
        //System.err.println("adding extent for prenominal ne: "+ne);
        Mention extent = new Mention(ne.getSpan(), ne.getSpan(), ne.getEntityId(),null,"NAME");
        extent.setNameType(ne.getEntityType());
        extents.add(extent);
      }
    }
  }

  /**
   * Returns the named-entity type for the head token: first found on an
   * ancestor (stopping at the sentence), otherwise on its last child, else null.
   */
  private String getEntityType(Parse headToken) {
    String entityType;
    for (Parse parent = headToken.getParent(); parent != null; parent = parent.getParent()) {
      entityType = parent.getEntityType();
      if (entityType != null) {
        return entityType;
      }
      if (parent.isSentence()) {
        break;
      }
    }
    List<Parse> tc = headToken.getChildren();
    int tcs = tc.size();
    if (tcs > 0) {
      Parse tchild = tc.get(tcs - 1);
      entityType = tchild.getEntityType();
      if (entityType != null) {
        return entityType;
      }
    }
    return null;
  }

  /**
   * Returns true if np is a proper part of a larger named entity: some ancestor
   * (up to the sentence) carries an entity type and np's span does not cover
   * that ancestor's span.
   */
  private boolean isPartOfName(Parse np) {
    String entityType;
    for (Parse parent = np.getParent(); parent != null; parent = parent.getParent()) {
      entityType = parent.getEntityType();
      //System.err.println("AbstractMentionFinder.isPartOfName: entityType="+entityType);
      if (entityType != null) {
        //System.err.println("npSpan = "+np.getSpan()+" parentSpan="+parent.getSpan());
        if (!np.getSpan().contains(parent.getSpan())) {
          return true;
        }
      }
      if (parent.isSentence()) {
        break;
      }
    }
    return false;
  }

  /** Return all noun phrases which are contained by <code>p</code>.
   * @param p The parse in which to find the noun phrases.
   * @return A list of <code>Parse</code> objects which are noun phrases contained by <code>p</code>.
   */
  //protected abstract List getNounPhrases(Parse p);

  /** Returns the named entities contained in the specified parse. */
  public List<Parse> getNamedEntities(Parse p) {
    return p.getNamedEntities();
  }

  /** Finds and returns the mentions of the specified parse, sorted by span. */
  public Mention[] getMentions(Parse p) {
    List<Parse> nps = p.getNounPhrases();
    Collections.sort(nps);
    Map<Parse, Parse> headMap = constructHeadMap(nps);
    //System.err.println("AbstractMentionFinder.getMentions: got " + nps.size()); // + " nps, and " + nes.size() + " named entities");
    Mention[] mentions = collectMentions(nps, headMap);
    return mentions;
  }

  /** Returns whether coordinated noun phrase conjuncts are collected as mentions. */
  public boolean isCoordinatedNounPhraseCollection() {
    return collectCoordinatedNounPhrases;
  }

  /** Specifies whether coordinated noun phrase conjuncts should be collected as mentions. */
  public void setCoordinatedNounPhraseCollection(boolean b) {
    collectCoordinatedNounPhrases = b;
  }
}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/AbstractParse.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/AbstractParse.java
new file mode 100644
index 0000000..b9fcfd3
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/AbstractParse.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.mention;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Provides default implemenation of many of the methods in the {@link Parse} interface.
+ */
+public abstract class AbstractParse implements Parse {
+
+ public boolean isCoordinatedNounPhrase() {
+ List<Parse> parts = getSyntacticChildren();
+ if (parts.size() >= 2) {
+ for (int pi = 1; pi < parts.size(); pi++) {
+ Parse child = parts.get(pi);
+ String ctype = child.getSyntacticType();
+ if (ctype != null && ctype.equals("CC") && !child.toString().equals("&")) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ public List<Parse> getNounPhrases() {
+ List<Parse> parts = getSyntacticChildren();
+ List<Parse> nps = new ArrayList<Parse>();
+ while (parts.size() > 0) {
+ List<Parse> newParts = new ArrayList<Parse>();
+ for (int pi=0,pn=parts.size();pi<pn;pi++) {
+ //System.err.println("AbstractParse.getNounPhrases "+parts.get(pi).getClass());
+ Parse cp = parts.get(pi);
+ if (cp.isNounPhrase()) {
+ nps.add(cp);
+ }
+ if (!cp.isToken()) {
+ newParts.addAll(cp.getSyntacticChildren());
+ }
+ }
+ parts = newParts;
+ }
+ return nps;
+ }
+ }
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/DefaultParse.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/DefaultParse.java
new file mode 100644
index 0000000..e096b7b
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/DefaultParse.java
@@ -0,0 +1,328 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.mention;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+import java.util.Stack;
+
+import opennlp.tools.parser.Parse;
+import opennlp.tools.parser.chunking.Parser;
+import opennlp.tools.util.Span;
+
+/**
+ * This class is a wrapper for {@link opennlp.tools.parser.Parse} mapping it to the API specified in {@link opennlp.tools.coref.mention.Parse}.
+ * This allows coreference to be done on the output of the parser.
+ */
+public class DefaultParse extends AbstractParse {
+
+ public static String[] NAME_TYPES = {"person", "organization", "location", "date", "time", "percentage", "money"};
+
+ private Parse parse;
+ private int sentenceNumber;
+ private static Set<String> entitySet = new HashSet<String>(Arrays.asList(NAME_TYPES));
+
+ /**
+ * Initializes the current instance.
+ *
+ * @param parse
+ * @param sentenceNumber
+ */
+ public DefaultParse(Parse parse, int sentenceNumber) {
+ this.parse = parse;
+ this.sentenceNumber = sentenceNumber;
+
+ // Should we just maintain a parse id map !?
+ }
+
+ public int getSentenceNumber() {
+ return sentenceNumber;
+ }
+
+ public List<opennlp.tools.coref.mention.Parse> getNamedEntities() {
+ List<Parse> names = new ArrayList<Parse>();
+ List<Parse> kids = new LinkedList<Parse>(Arrays.asList(parse.getChildren()));
+ while (kids.size() > 0) {
+ Parse p = kids.remove(0);
+ if (entitySet.contains(p.getType())) {
+ names.add(p);
+ }
+ else {
+ kids.addAll(Arrays.asList(p.getChildren()));
+ }
+ }
+ return createParses(names.toArray(new Parse[names.size()]));
+ }
+
+ public List<opennlp.tools.coref.mention.Parse> getChildren() {
+ return createParses(parse.getChildren());
+ }
+
+ public List<opennlp.tools.coref.mention.Parse> getSyntacticChildren() {
+ List<Parse> kids = new ArrayList<Parse>(Arrays.asList(parse.getChildren()));
+ for (int ci = 0; ci < kids.size(); ci++) {
+ Parse kid = kids.get(ci);
+ if (entitySet.contains(kid.getType())) {
+ kids.remove(ci);
+ kids.addAll(ci, Arrays.asList(kid.getChildren()));
+ ci--;
+ }
+ }
+ return createParses(kids.toArray(new Parse[kids.size()]));
+ }
+
+ public List<opennlp.tools.coref.mention.Parse> getTokens() {
+ List<Parse> tokens = new ArrayList<Parse>();
+ List<Parse> kids = new LinkedList<Parse>(Arrays.asList(parse.getChildren()));
+ while (kids.size() > 0) {
+ Parse p = kids.remove(0);
+ if (p.isPosTag()) {
+ tokens.add(p);
+ }
+ else {
+ kids.addAll(0,Arrays.asList(p.getChildren()));
+ }
+ }
+ return createParses(tokens.toArray(new Parse[tokens.size()]));
+ }
+
+ public String getSyntacticType() {
+ if (entitySet.contains(parse.getType())) {
+ return null;
+ }
+ else if (parse.getType().contains("#")) {
+ return parse.getType().substring(0, parse.getType().indexOf('#'));
+ }
+ else {
+ return parse.getType();
+ }
+ }
+
+ private List<opennlp.tools.coref.mention.Parse> createParses(Parse[] parses) {
+ List<opennlp.tools.coref.mention.Parse> newParses =
+ new ArrayList<opennlp.tools.coref.mention.Parse>(parses.length);
+
+ for (int pi=0,pn=parses.length;pi<pn;pi++) {
+ newParses.add(new DefaultParse(parses[pi],sentenceNumber));
+ }
+
+ return newParses;
+ }
+
+ public String getEntityType() {
+ if (entitySet.contains(parse.getType())) {
+ return parse.getType();
+ }
+ else {
+ return null;
+ }
+ }
+
+ public boolean isParentNAC() {
+ Parse parent = parse.getParent();
+ while(parent != null) {
+ if (parent.getType().equals("NAC")) {
+ return true;
+ }
+ parent = parent.getParent();
+ }
+ return false;
+ }
+
+ public opennlp.tools.coref.mention.Parse getParent() {
+ Parse parent = parse.getParent();
+ if (parent == null) {
+ return null;
+ }
+ else {
+ return new DefaultParse(parent,sentenceNumber);
+ }
+ }
+
+ public boolean isNamedEntity() {
+
+ // TODO: We should use here a special tag to, where
+ // the type can be extracted from. Then it just depends
+ // on the training data and not the values inside NAME_TYPES.
+
+ if (entitySet.contains(parse.getType())) {
+ return true;
+ }
+ else {
+ return false;
+ }
+ }
+
+ public boolean isNounPhrase() {
+ return parse.getType().equals("NP") || parse.getType().startsWith("NP#");
+ }
+
+ public boolean isSentence() {
+ return parse.getType().equals(Parser.TOP_NODE);
+ }
+
+ public boolean isToken() {
+ return parse.isPosTag();
+ }
+
+ public int getEntityId() {
+
+ String type = parse.getType();
+
+ if (type.contains("#")) {
+ String numberString = type.substring(type.indexOf('#') + 1);
+ return Integer.parseInt(numberString);
+ }
+ else {
+ return -1;
+ }
+ }
+
+ public Span getSpan() {
+ return parse.getSpan();
+ }
+
+ public int compareTo(opennlp.tools.coref.mention.Parse p) {
+
+ if (p == this) {
+ return 0;
+ }
+
+ if (getSentenceNumber() < p.getSentenceNumber()) {
+ return -1;
+ }
+ else if (getSentenceNumber() > p.getSentenceNumber()) {
+ return 1;
+ }
+ else {
+
+ if (parse.getSpan().getStart() == p.getSpan().getStart() &&
+ parse.getSpan().getEnd() == p.getSpan().getEnd()) {
+
+ System.out.println("Maybe incorrect measurement!");
+
+ Stack<Parse> parents = new Stack<Parse>();
+
+
+
+
+ // get parent and update distance
+ // if match return distance
+ // if not match do it again
+ }
+
+ return parse.getSpan().compareTo(p.getSpan());
+ }
+ }
+
+ @Override
+ public String toString() {
+ return parse.getCoveredText();
+ }
+
+
+ public opennlp.tools.coref.mention.Parse getPreviousToken() {
+ Parse parent = parse.getParent();
+ Parse node = parse;
+ int index=-1;
+ //find parent with previous children
+ while(parent != null && index < 0) {
+ index = parent.indexOf(node)-1;
+ if (index < 0) {
+ node = parent;
+ parent = parent.getParent();
+ }
+ }
+ //find right-most child which is a token
+ if (index < 0) {
+ return null;
+ }
+ else {
+ Parse p = parent.getChildren()[index];
+ while (!p.isPosTag()) {
+ Parse[] kids = p.getChildren();
+ p = kids[kids.length-1];
+ }
+ return new DefaultParse(p,sentenceNumber);
+ }
+ }
+
+ public opennlp.tools.coref.mention.Parse getNextToken() {
+ Parse parent = parse.getParent();
+ Parse node = parse;
+ int index=-1;
+ //find parent with subsequent children
+ while(parent != null) {
+ index = parent.indexOf(node)+1;
+ if (index == parent.getChildCount()) {
+ node = parent;
+ parent = parent.getParent();
+ }
+ else {
+ break;
+ }
+ }
+ //find left-most child which is a token
+ if (parent == null) {
+ return null;
+ }
+ else {
+ Parse p = parent.getChildren()[index];
+ while (!p.isPosTag()) {
+ p = p.getChildren()[0];
+ }
+ return new DefaultParse(p,sentenceNumber);
+ }
+ }
+
+ @Override
+ public boolean equals(Object o) {
+
+ boolean result;
+
+ if (o == this) {
+ result = true;
+ }
+ else if (o instanceof DefaultParse) {
+ result = parse == ((DefaultParse) o).parse;
+ }
+ else {
+ result = false;
+ }
+
+ return result;
+ }
+
+ @Override
+ public int hashCode() {
+ return parse.hashCode();
+ }
+
+ /**
+ * Retrieves the {@link Parse}.
+ *
+ * @return the {@link Parse}
+ */
+ public Parse getParse() {
+ return parse;
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/Dictionary.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/Dictionary.java
new file mode 100644
index 0000000..ef18faa
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/Dictionary.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.mention;
+
/**
 * Interface to provide dictionary information to the coreference module assuming a
 * hierarchically structured dictionary (such as WordNet) is available.
 */
public interface Dictionary {

  /**
   * Returns the lemmas of the specified word with the specified part-of-speech.
   *
   * @param word The word whose lemmas are desired.
   * @param pos The part-of-speech of the specified word.
   * @return The lemmas of the specified word given the specified part-of-speech.
   */
  String[] getLemmas(String word, String pos);

  /**
   * Returns a key indicating the specified sense number of the specified
   * lemma with the specified part-of-speech.
   *
   * @param lemma The lemma for which the key is desired.
   * @param pos The part-of-speech for which the key is desired.
   * @param senseNumber The sense number for which the key is desired.
   * @return A key for the specified sense of the specified lemma and part-of-speech.
   */
  String getSenseKey(String lemma, String pos, int senseNumber);

  /**
   * Returns the number of senses in the dictionary for the specified lemma.
   *
   * @param lemma A lemmatized form of the word to look up.
   * @param pos The part-of-speech for the lemma.
   * @return The number of senses in the dictionary for the specified lemma.
   */
  int getNumSenses(String lemma, String pos);

  /**
   * Returns an array of keys, one for each parent of the specified sense number
   * of the specified lemma with the specified part-of-speech.
   *
   * @param lemma A lemmatized form of the word to look up.
   * @param pos The part-of-speech for the lemma.
   * @param senseNumber The sense number for which the parent keys are desired.
   * @return An array of keys, one per parent of the specified sense.
   */
  String[] getParentSenseKeys(String lemma, String pos, int senseNumber);
}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/DictionaryFactory.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/DictionaryFactory.java
new file mode 100644
index 0000000..eb0e402
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/DictionaryFactory.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.mention;
+
+import java.io.IOException;
+
+import net.didion.jwnl.JWNLException;
+
+/** Factory class used to get an instance of a dictionary object.
+ * @see opennlp.tools.coref.mention.Dictionary
+ * */
+public class DictionaryFactory {
+
+ private static Dictionary dictionary;
+
+ /**
+ * Returns the default implementation of the Dictionary interface.
+ * @return the default implementation of the Dictionary interface.
+ */
+ public static Dictionary getDictionary() {
+ if (dictionary == null) {
+ try {
+ dictionary = new JWNLDictionary(System.getProperty("WNSEARCHDIR"));
+ }
+ catch(IOException e) {
+ System.err.println(e);
+ }
+ catch(JWNLException e) {
+ System.err.println(e);
+ }
+ }
+ return dictionary;
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/HeadFinder.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/HeadFinder.java
new file mode 100644
index 0000000..b378ef9
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/HeadFinder.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.mention;
+
+/**
+ * Interface for finding head words in noun phrases and head noun-phrases in parses.
+ */
+public interface HeadFinder {
+
+ /**
+ * Returns the child parse which contains the lexical head of the specified parse.
+ *
+ * @param parse The parse in which to find the head.
+ * @return The parse containing the lexical head of the specified parse. If no head is
+ * available or the constituent has no sub-components that are eligible heads then null is returned.
+ */
+ public Parse getHead(Parse parse);
+
+ /**
+ * Returns which index the specified list of token is the head word.
+ *
+ * @param parse The parse in which to find the head index.
+ * @return The index of the head token.
+ */
+ public int getHeadIndex(Parse parse);
+
+ /**
+ * Returns the parse bottom-most head of a <code>Parse</code>. If no
+ * head is available which is a child of <code>p</code> then <code>p</code> is returned.
+ *
+ * @param p Parse to find the head of.
+ * @return bottom-most head of p.
+ */
+ public Parse getLastHead(Parse p);
+
+ /**
+ * Returns head token for the specified np parse.
+ *
+ * @param np The noun parse to get head from.
+ * @return head token parse.
+ */
+ public Parse getHeadToken(Parse np);
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/JWNLDictionary.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/JWNLDictionary.java
new file mode 100644
index 0000000..2c2d4ee
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/JWNLDictionary.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.mention;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import net.didion.jwnl.JWNLException;
+import net.didion.jwnl.data.Adjective;
+import net.didion.jwnl.data.FileDictionaryElementFactory;
+import net.didion.jwnl.data.IndexWord;
+import net.didion.jwnl.data.POS;
+import net.didion.jwnl.data.Pointer;
+import net.didion.jwnl.data.PointerType;
+import net.didion.jwnl.data.Synset;
+import net.didion.jwnl.data.VerbFrame;
+import net.didion.jwnl.dictionary.FileBackedDictionary;
+import net.didion.jwnl.dictionary.MorphologicalProcessor;
+import net.didion.jwnl.dictionary.file_manager.FileManager;
+import net.didion.jwnl.dictionary.file_manager.FileManagerImpl;
+import net.didion.jwnl.dictionary.morph.DefaultMorphologicalProcessor;
+import net.didion.jwnl.dictionary.morph.DetachSuffixesOperation;
+import net.didion.jwnl.dictionary.morph.LookupExceptionsOperation;
+import net.didion.jwnl.dictionary.morph.LookupIndexWordOperation;
+import net.didion.jwnl.dictionary.morph.Operation;
+import net.didion.jwnl.dictionary.morph.TokenizerOperation;
+import net.didion.jwnl.princeton.data.PrincetonWN17FileDictionaryElementFactory;
+import net.didion.jwnl.princeton.file.PrincetonRandomAccessDictionaryFile;
+
/**
 * An implementation of the Dictionary interface using the JWNL library.
 */
public class JWNLDictionary implements Dictionary {

  // The installed JWNL dictionary instance backing all lookups.
  private net.didion.jwnl.dictionary.Dictionary dict;

  // Morphological processor used to reduce inflected forms to base forms (lemmas).
  private MorphologicalProcessor morphy;

  // Shared empty result, returned when a lemma has no parent senses.
  private static String[] empty = new String[0];

  /**
   * Initializes the JWNL dictionary from the specified WordNet search directory.
   * <p>
   * NOTE(review): the initialization sequence below (JWNL static initializers,
   * suffix maps, operation chain, then FileBackedDictionary.install before
   * getInstance) appears order-dependent — do not reorder without consulting the
   * JWNL documentation.
   *
   * @param searchDirectory Directory containing the WordNet index and data files.
   * @throws IOException If the dictionary files cannot be read.
   * @throws JWNLException If JWNL fails to initialize.
   */
  public JWNLDictionary(String searchDirectory) throws IOException, JWNLException {
    // JWNL requires these static initializations before any dictionary objects are built.
    PointerType.initialize();
    Adjective.initialize();
    VerbFrame.initialize();
    // Suffix-detachment rules per part-of-speech, used for rule-based lemmatization.
    Map<POS, String[][]> suffixMap = new HashMap<POS, String[][]>();
    suffixMap.put(POS.NOUN,new String[][] {{"s",""},{"ses","s"},{"xes","x"},{"zes","z"},{"ches","ch"},{"shes","sh"},{"men","man"},{"ies","y"}});
    suffixMap.put(POS.VERB,new String[][] {{"s",""},{"ies","y"},{"es","e"},{"es",""},{"ed","e"},{"ed",""},{"ing","e"},{"ing",""}});
    suffixMap.put(POS.ADJECTIVE,new String[][] {{"er",""},{"est",""},{"er","e"},{"est","e"}});
    DetachSuffixesOperation tokDso = new DetachSuffixesOperation(suffixMap);
    tokDso.addDelegate(DetachSuffixesOperation.OPERATIONS,new Operation[] {new LookupIndexWordOperation(),new LookupExceptionsOperation()});
    // The tokenizer splits on space and hyphen so multi-word entries can be looked up.
    TokenizerOperation tokOp = new TokenizerOperation(new String[] {" ","-"});
    tokOp.addDelegate(TokenizerOperation.TOKEN_OPERATIONS,new Operation[] {new LookupIndexWordOperation(),new LookupExceptionsOperation(),tokDso});
    DetachSuffixesOperation morphDso = new DetachSuffixesOperation(suffixMap);
    morphDso.addDelegate(DetachSuffixesOperation.OPERATIONS,new Operation[] {new LookupIndexWordOperation(),new LookupExceptionsOperation()});
    Operation[] operations = {new LookupExceptionsOperation(), morphDso , tokOp};
    morphy = new DefaultMorphologicalProcessor(operations);
    FileManager manager = new FileManagerImpl(searchDirectory,PrincetonRandomAccessDictionaryFile.class);
    FileDictionaryElementFactory factory = new PrincetonWN17FileDictionaryElementFactory();
    // install() must run before getInstance() so the singleton is backed by these files.
    FileBackedDictionary.install(manager, morphy,factory,true);
    dict = net.didion.jwnl.dictionary.Dictionary.getInstance();
    // Use the processor actually installed in the dictionary, not the local one.
    morphy = dict.getMorphologicalProcessor();
  }

  /**
   * Returns all base forms of the specified word under the WordNet POS mapped from
   * the given tag (Penn-style prefixes N/V/J/R or lowercase n/v/a/r; anything else
   * defaults to noun).
   * <p>
   * NOTE(review): returns null (not an empty array) if the JWNL lookup throws —
   * callers such as main() below would NPE on that; confirm intended.
   */
  @SuppressWarnings("unchecked")
  public String[] getLemmas(String word, String tag) {
    try {
      POS pos;
      if (tag.startsWith("N") || tag.startsWith("n")) {
        pos = POS.NOUN;
      }
      else if (tag.startsWith("V") || tag.startsWith("v")) {
        pos = POS.VERB;
      }
      else if (tag.startsWith("J") || tag.startsWith("a")) {
        pos = POS.ADJECTIVE;
      }
      else if (tag.startsWith("R") || tag.startsWith("r")) {
        pos = POS.ADVERB;
      }
      else {
        // Unknown tags are treated as nouns.
        pos = POS.NOUN;
      }
      List<String> lemmas = morphy.lookupAllBaseForms(pos,word);
      return lemmas.toArray(new String[lemmas.size()]);
    }
    catch (JWNLException e) {
      e.printStackTrace();
      return null;
    }
  }

  /**
   * Returns the synset offset of the given sense of the lemma as a string key,
   * or null if the lemma is unknown or the lookup fails.
   * <p>
   * NOTE(review): the pos parameter is ignored; lookup is always POS.NOUN — confirm intended.
   */
  public String getSenseKey(String lemma, String pos,int sense) {
    try {
      IndexWord iw = dict.getIndexWord(POS.NOUN,lemma);
      if (iw == null) {
        return null;
      }
      return String.valueOf(iw.getSynsetOffsets()[sense]);
    }
    catch (JWNLException e) {
      e.printStackTrace();
      return null;
    }

  }

  /**
   * Returns the number of senses for the lemma, or 0 if unknown or on lookup failure.
   * <p>
   * NOTE(review): the pos parameter is ignored (always POS.NOUN), and JWNL errors
   * are silently mapped to 0 — confirm intended.
   */
  public int getNumSenses(String lemma, String pos) {
    try {
      IndexWord iw = dict.getIndexWord(POS.NOUN,lemma);
      if (iw == null){
        return 0;
      }
      return iw.getSenseCount();
    }
    catch (JWNLException e) {
      return 0;
    }
  }

  // Recursively accumulates the synset offsets of all hypernym ancestors of the
  // specified synset into the parents list.
  private void getParents(Synset synset, List<String> parents) throws JWNLException {
    Pointer[] pointers = synset.getPointers();
    for (int pi=0,pn=pointers.length;pi<pn;pi++) {
      if (pointers[pi].getType() == PointerType.HYPERNYM) {
        Synset parent = pointers[pi].getTargetSynset();
        parents.add(String.valueOf(parent.getOffset()));
        getParents(parent,parents);
      }
    }
  }

  /**
   * Returns the keys of all hypernym ancestors for the given sense of the lemma,
   * or an empty array if the lemma is unknown.
   * <p>
   * NOTE(review): the pos parameter is ignored (always POS.NOUN), and null is
   * returned on JWNLException while the not-found case returns empty — confirm intended.
   */
  public String[] getParentSenseKeys(String lemma, String pos, int sense) {
    //System.err.println("JWNLDictionary.getParentSenseKeys: lemma="+lemma);
    try {
      IndexWord iw = dict.getIndexWord(POS.NOUN,lemma);
      if (iw != null) {
        // JWNL sense numbers are 1-based; the sense parameter is 0-based.
        Synset synset = iw.getSense(sense+1);
        List<String> parents = new ArrayList<String>();
        getParents(synset,parents);
        return parents.toArray(new String[parents.size()]);
      }
      else {
        return empty;
      }
    }
    catch (JWNLException e) {
      e.printStackTrace();
      return null;
    }
  }

  /**
   * Command-line demo: prints the parent sense keys for every noun sense of the
   * word given as the first argument, using the WNSEARCHDIR system property to
   * locate WordNet.
   */
  public static void main(String[] args) throws IOException, JWNLException {
    String searchDir = System.getProperty("WNSEARCHDIR");
    System.err.println("searchDir="+searchDir);
    if (searchDir != null) {
      Dictionary dict = new JWNLDictionary(System.getProperty("WNSEARCHDIR"));
      String word = args[0];
      String[] lemmas = dict.getLemmas(word,"NN");
      for (int li=0,ln=lemmas.length;li<ln;li++) {
        for (int si=0,sn=dict.getNumSenses(lemmas[li],"NN");si<sn;si++) {
          System.out.println(lemmas[li]+" ("+si+")\t"+java.util.Arrays.asList(dict.getParentSenseKeys(lemmas[li],"NN",si)));
        }
      }
    }
  }
}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/Mention.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/Mention.java
new file mode 100644
index 0000000..9593eaf
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/Mention.java
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.mention;
+
+import opennlp.tools.util.Span;
+
+/**
+ * Data structure representation of a mention.
+ */
+public class Mention implements Comparable<Mention> {
+
+ /**
+ * Represents the character offset for this extent.
+ */
+ private Span span;
+
+ /**
+ * A string representing the type of this extent. This is helpful for determining
+ * which piece of code created a particular extent.
+ */
+ protected String type;
+
+ /**
+ * The entity id indicating which entity this extent belongs to. This is only
+ * used when training a coreference classifier.
+ */
+ private int id;
+
+ /**
+ * Represents the character offsets of the the head of this extent.
+ */
+ private Span headSpan;
+
+ /**
+ * The parse node that this extent is based on.
+ */
+ protected Parse parse;
+
+ /**
+ * A string representing the name type for this extent.
+ */
+ protected String nameType;
+
+ public Mention(Span span, Span headSpan, int entityId, Parse parse, String extentType) {
+ this.span=span;
+ this.headSpan=headSpan;
+ this.id=entityId;
+ this.type=extentType;
+ this.parse = parse;
+ }
+
+ public Mention(Span span, Span headSpan, int entityId, Parse parse, String extentType, String nameType) {
+ this.span=span;
+ this.headSpan=headSpan;
+ this.id=entityId;
+ this.type=extentType;
+ this.parse = parse;
+ this.nameType = nameType;
+ }
+
+ public Mention(Mention mention) {
+ this(mention.span,mention.headSpan,mention.id,mention.parse,mention.type,mention.nameType);
+ }
+
+ /**
+ * Returns the character offsets for this extent.
+ *
+ * @return The span representing the character offsets of this extent.
+ */
+ public Span getSpan() {
+ return span;
+ }
+
+ /**
+ * Returns the character offsets for the head of this extent.
+ *
+ * @return The span representing the character offsets for the head of this extent.
+ */
+ public Span getHeadSpan() {
+ return headSpan;
+ }
+
+ /**
+ * Returns the parse node that this extent is based on.
+ *
+ * @return The parse node that this extent is based on or null if the extent is newly created.
+ */
+ public Parse getParse() {
+ return parse;
+ }
+
+ public int compareTo(Mention e) {
+ return span.compareTo(e.span);
+ }
+
+ /**
+ * Specifies the parse for this mention.
+ * @param parse The parse for this mention.
+ */
+ public void setParse(Parse parse) {
+ this.parse = parse;
+ }
+
+ /**
+ * Returns the named-entity category associated with this mention.
+ *
+ * @return the named-entity category associated with this mention.
+ */
+ public String getNameType() {
+ return nameType;
+ }
+
+ /**
+ * Specifies the named-entity category associated with this mention.
+ *
+ * @param nameType the named-entity category associated with this mention.
+ */
+ protected void setNameType(String nameType) {
+ this.nameType = nameType;
+ }
+
+ /**
+ * Associates an id with this mention.
+ *
+ * @param i The id for this mention.
+ */
+ public void setId(int i) {
+ id=i;
+ }
+
+ /**
+ * Returns the id associated with this mention.
+ *
+ * @return the id associated with this mention.
+ */
+ public int getId() {
+ return id;
+ }
+
+ @Override
+ public String toString() {
+ return "mention(span="+span+",hs="+headSpan+", type="+type+", id="+id+" "+parse+" )";
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/MentionContext.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/MentionContext.java
new file mode 100644
index 0000000..be81b79
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/MentionContext.java
@@ -0,0 +1,419 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.mention;
+
+import java.util.List;
+
+import opennlp.tools.coref.sim.Context;
+import opennlp.tools.coref.sim.GenderEnum;
+import opennlp.tools.coref.sim.NumberEnum;
+import opennlp.tools.util.Span;
+
+/**
+ * Data structure representation of a mention with additional contextual information.
+ * The contextual information is used in performing coreference resolution.
+ */
+public class MentionContext extends Context {
+
+  /**
+   * The index of first token which is not part of a descriptor. This is 0 if no descriptor is present.
+   */
+  private int nonDescriptorStart;
+
+  /**
+   * The Parse of the head constituent of this mention.
+   */
+  private Parse head;
+
+  /**
+   * Sentence-token-based span whose end is the last token of the mention.
+   */
+  private Span indexSpan;
+
+  /**
+   * Position of the NP in the sentence.
+   */
+  private int nounLocation;
+
+  /**
+   * Position of the NP in the document.
+   */
+  private int nounNumber;
+
+  /**
+   * Number of noun phrases in the sentence which contains this mention.
+   * NOTE(review): the constructor assigns the mention count here, while the
+   * accessor getMaxNounPhraseSentenceIndex documents "one less than the
+   * number" — confirm which convention callers rely on.
+   */
+  private int maxNounLocation;
+
+  /**
+   * Index of the sentence in the document which contains this mention.
+   */
+  private int sentenceNumber;
+
+  /**
+   * The token preceding this mention's maximal noun phrase.
+   */
+  private Parse prevToken;
+
+  /**
+   * The token following this mention's maximal noun phrase.
+   */
+  private Parse nextToken;
+
+  /**
+   * The token following this mention's basal noun phrase.
+   */
+  private Parse basalNextToken;
+
+  /**
+   * The parse of the mention's head word.
+   */
+  private Parse headToken;
+
+  /**
+   * The parse of the first word in the mention.
+   */
+  private Parse firstToken;
+
+  /**
+   * The text of the first word in the mention.
+   */
+  private String firstTokenText;
+
+  /**
+   * The pos-tag of the first word in the mention.
+   */
+  private String firstTokenTag;
+
+  /**
+   * The gender assigned to this mention.
+   */
+  private GenderEnum gender;
+
+  /**
+   * The probability associated with the gender assignment.
+   */
+  private double genderProb;
+
+  /**
+   * The number assigned to this mention.
+   */
+  private NumberEnum number;
+
+  /**
+   * The probability associated with the number assignment.
+   */
+  private double numberProb;
+
+  /**
+   * Constructs context information for the specified mention extent.
+   * Gender and number start out as UNKNOWN with probability 0 and are
+   * assigned later via {@link #setGender} / {@link #setNumber}.
+   *
+   * @param span The character span of the mention.
+   * @param headSpan The character span of the mention's head.
+   * @param entityId The id of the entity this mention is part of.
+   * @param parse The parse node for this mention.
+   * @param extentType The type of this extent.
+   * @param nameType The named-entity type of this mention.
+   * @param mentionIndex The mention's position in the sentence.
+   * @param mentionsInSentence The number of mentions in the sentence.
+   * @param mentionIndexInDocument The index of this mention with respect to the document.
+   * @param sentenceIndex The index of the sentence which contains this mention.
+   * @param headFinder An object which provides head information.
+   */
+  public MentionContext(Span span, Span headSpan, int entityId, Parse parse, String extentType, String nameType, int mentionIndex, int mentionsInSentence, int mentionIndexInDocument, int sentenceIndex, HeadFinder headFinder) {
+    super(span,headSpan,entityId,parse,extentType,nameType,headFinder);
+    nounLocation = mentionIndex;
+    maxNounLocation = mentionsInSentence;
+    nounNumber = mentionIndexInDocument;
+    sentenceNumber = sentenceIndex;
+    indexSpan = parse.getSpan();
+    prevToken = parse.getPreviousToken();
+    nextToken = parse.getNextToken();
+    head = headFinder.getLastHead(parse);
+    List<Parse> headTokens = head.getTokens();
+    // tokens is inherited from Context — presumably Object[]; the casts in
+    // initHeads suggest so. TODO confirm against Context.
+    tokens = headTokens.toArray(new Parse[headTokens.size()]);
+    basalNextToken = head.getNextToken();
+    //System.err.println("MentionContext.init: "+ent+" "+ent.getEntityId()+" head="+head);
+    nonDescriptorStart = 0;
+    initHeads(headFinder.getHeadIndex(head));
+    gender = GenderEnum.UNKNOWN;
+    this.genderProb = 0d;
+    number = NumberEnum.UNKNOWN;
+    this.numberProb = 0d;
+  }
+  /**
+   * Constructs context information for the specified mention.
+   *
+   * @param mention The mention object on which this object is based.
+   * @param mentionIndexInSentence The mention's position in the sentence.
+   * @param mentionsInSentence The number of mentions in the sentence.
+   * @param mentionIndexInDocument The index of this mention with respect to the document.
+   * @param sentenceIndex The index of the sentence which contains this mention.
+   * @param headFinder An object which provides head information.
+   */
+  public MentionContext(Mention mention, int mentionIndexInSentence, int mentionsInSentence, int mentionIndexInDocument, int sentenceIndex, HeadFinder headFinder) {
+    this(mention.getSpan(),mention.getHeadSpan(),mention.getId(),mention.getParse(),mention.type,mention.nameType, mentionIndexInSentence,mentionsInSentence,mentionIndexInDocument,sentenceIndex,headFinder);
+  }
+
+
+  /**
+   * Constructs context information for the specified mention.
+   *
+   * @param mentionParse Mention parse structure for which context is to be constructed.
+   * @param mentionIndex mention position in sentence.
+   * @param mentionsInSentence Number of mentions in the sentence.
+   * @param mentionsInDocument Number of mentions in the document.
+   * @param sentenceIndex Sentence number for this mention.
+   * @param nameType The named-entity type for this mention.
+   * @param headFinder Object which provides head information.
+   */
+  /*
+  public MentionContext(Parse mentionParse, int mentionIndex, int mentionsInSentence, int mentionsInDocument, int sentenceIndex, String nameType, HeadFinder headFinder) {
+    nounLocation = mentionIndex;
+    maxNounLocation = mentionsInDocument;
+    sentenceNumber = sentenceIndex;
+    parse = mentionParse;
+    indexSpan = mentionParse.getSpan();
+    prevToken = mentionParse.getPreviousToken();
+    nextToken = mentionParse.getNextToken();
+    head = headFinder.getLastHead(mentionParse);
+    List headTokens = head.getTokens();
+    tokens = (Parse[]) headTokens.toArray(new Parse[headTokens.size()]);
+    basalNextToken = head.getNextToken();
+    //System.err.println("MentionContext.init: "+ent+" "+ent.getEntityId()+" head="+head);
+    indexHeadSpan = head.getSpan();
+    nonDescriptorStart = 0;
+    initHeads(headFinder.getHeadIndex(head));
+    this.neType= nameType;
+    if (getHeadTokenTag().startsWith("NN") && !getHeadTokenTag().startsWith("NNP")) {
+      //if (headTokenTag.startsWith("NNP") && neType != null) {
+      this.synsets = getSynsetSet(this);
+    }
+    else {
+      this.synsets=Collections.EMPTY_SET;
+    }
+    gender = GenderEnum.UNKNOWN;
+    this.genderProb = 0d;
+    number = NumberEnum.UNKNOWN;
+    this.numberProb = 0d;
+  }
+  */
+
+  // Caches head/first token parse, text and tag. headTokenIndex,
+  // headTokenText and headTokenTag are inherited from Context —
+  // TODO confirm their declarations there.
+  private void initHeads(int headIndex) {
+    this.headTokenIndex=headIndex;
+    this.headToken = (Parse) tokens[getHeadTokenIndex()];
+    this.headTokenText = headToken.toString();
+    this.headTokenTag=headToken.getSyntacticType();
+    this.firstToken = (Parse) tokens[0];
+    this.firstTokenTag = firstToken.getSyntacticType();
+    this.firstTokenText=firstToken.toString();
+  }
+
+  /**
+   * Returns the parse of the head token for this mention.
+   *
+   * @return the parse of the head token for this mention.
+   */
+  public Parse getHeadTokenParse() {
+    return headToken;
+  }
+
+  /**
+   * Returns the space-delimited text of the tokens of this mention's head constituent.
+   *
+   * @return the text of the head constituent.
+   */
+  public String getHeadText() {
+    StringBuilder headText = new StringBuilder();
+    for (int hsi = 0; hsi < tokens.length; hsi++) {
+      headText.append(" ").append(tokens[hsi].toString());
+    }
+    return headText.toString().substring(1);
+  }
+
+  /**
+   * Returns the parse of the head constituent of this mention.
+   *
+   * @return the head constituent parse.
+   */
+  public Parse getHead() {
+    return head;
+  }
+
+  /**
+   * Returns the index of the first token which is not part of a descriptor;
+   * 0 if no descriptor is present.
+   *
+   * @return the non-descriptor start index.
+   */
+  public int getNonDescriptorStart() {
+    return this.nonDescriptorStart;
+  }
+
+  /**
+   * Returns a sentence-based token span for this mention. If this mention consists
+   * of the third, fourth, and fifth token, then this span will be 2..4.
+   *
+   * @return a sentence-based token span for this mention.
+   */
+  public Span getIndexSpan() {
+    return indexSpan;
+  }
+
+  /**
+   * Returns the index of the noun phrase for this mention in a sentence.
+   *
+   * @return the index of the noun phrase for this mention in a sentence.
+   */
+  public int getNounPhraseSentenceIndex() {
+    return nounLocation;
+  }
+
+  /**
+   * Returns the index of the noun phrase for this mention in a document.
+   *
+   * @return the index of the noun phrase for this mention in a document.
+   */
+  public int getNounPhraseDocumentIndex() {
+    return nounNumber;
+  }
+
+  /**
+   * Returns the index of the last noun phrase in the sentence containing this mention.
+   * This is one less than the number of noun phrases in the sentence which contains this mention.
+   *
+   * @return the index of the last noun phrase in the sentence containing this mention.
+   */
+  public int getMaxNounPhraseSentenceIndex() {
+    return maxNounLocation;
+  }
+
+  /**
+   * Returns the token following this mention's basal noun phrase.
+   *
+   * @return the token following the basal noun phrase.
+   */
+  public Parse getNextTokenBasal() {
+    return basalNextToken;
+  }
+
+  /**
+   * Returns the token preceding this mention's maximal noun phrase.
+   *
+   * @return the preceding token.
+   */
+  public Parse getPreviousToken() {
+    return prevToken;
+  }
+
+  /**
+   * Returns the token following this mention's maximal noun phrase.
+   *
+   * @return the following token.
+   */
+  public Parse getNextToken() {
+    return nextToken;
+  }
+
+  /**
+   * Returns the index of the sentence which contains this mention.
+   *
+   * @return the index of the sentence which contains this mention.
+   */
+  public int getSentenceNumber() {
+    return sentenceNumber;
+  }
+
+  /**
+   * Returns the parse for the first token in this mention.
+   *
+   * @return The parse for the first token in this mention.
+   */
+  public Parse getFirstToken() {
+    return firstToken;
+  }
+
+  /**
+   * Returns the text for the first token of the mention.
+   *
+   * @return The text for the first token of the mention.
+   */
+  public String getFirstTokenText() {
+    return firstTokenText;
+  }
+
+  /**
+   * Returns the pos-tag of the first token of this mention.
+   *
+   * @return the pos-tag of the first token of this mention.
+   */
+  public String getFirstTokenTag() {
+    return firstTokenTag;
+  }
+
+  /**
+   * Returns the parses for the tokens which are contained in this mention.
+   *
+   * @return An array of parses, in order, for each token contained in this mention.
+   */
+  public Parse[] getTokenParses() {
+    return (Parse[]) tokens;
+  }
+
+  /**
+   * Returns the text of this mention.
+   *
+   * @return A space-delimited string of the tokens of this mention.
+   */
+  public String toText() {
+    return parse.toString();
+  }
+
+  /*
+  private static String[] getLemmas(MentionContext xec) {
+    //TODO: Try multi-word lemmas first.
+    String word = xec.getHeadTokenText();
+    return DictionaryFactory.getDictionary().getLemmas(word,"NN");
+  }
+
+  private static Set getSynsetSet(MentionContext xec) {
+    //System.err.println("getting synsets for mention:"+xec.toText());
+    Set synsetSet = new HashSet();
+    String[] lemmas = getLemmas(xec);
+    for (int li = 0; li < lemmas.length; li++) {
+      String[] synsets = DictionaryFactory.getDictionary().getParentSenseKeys(lemmas[li],"NN",0);
+      for (int si=0,sn=synsets.length;si<sn;si++) {
+        synsetSet.add(synsets[si]);
+      }
+    }
+    return (synsetSet);
+  }
+  */
+
+  /**
+   * Assigns the specified gender with the specified probability to this mention.
+   *
+   * @param gender The gender to be given to this mention.
+   * @param probability The probability associated with the gender assignment.
+   */
+  public void setGender(GenderEnum gender, double probability) {
+    this.gender = gender;
+    this.genderProb = probability;
+  }
+
+  /**
+   * Returns the gender of this mention.
+   *
+   * @return The gender of this mention.
+   */
+  public GenderEnum getGender() {
+    return gender;
+  }
+
+  /**
+   * Returns the probability associated with the gender assignment.
+   *
+   * @return The probability associated with the gender assignment.
+   */
+  public double getGenderProb() {
+    return genderProb;
+  }
+
+  /**
+   * Assigns the specified number with the specified probability to this mention.
+   *
+   * @param number The number to be given to this mention.
+   * @param probability The probability associated with the number assignment.
+   */
+  public void setNumber(NumberEnum number, double probability) {
+    this.number = number;
+    this.numberProb = probability;
+  }
+
+  /**
+   * Returns the number of this mention.
+   *
+   * @return The number of this mention.
+   */
+  public NumberEnum getNumber() {
+    return number;
+  }
+
+  /**
+   * Returns the probability associated with the number assignment.
+   *
+   * @return The probability associated with the number assignment.
+   */
+  public double getNumberProb() {
+    return numberProb;
+  }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/MentionFinder.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/MentionFinder.java
new file mode 100644
index 0000000..2337dea
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/MentionFinder.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.coref.mention;
+
+/**
+ * Specifies the interface that objects which determine the space of
+ * mentions for coreference should implement.
+ */
+public interface MentionFinder {
+
+  /**
+   * Controls whether pre-nominal named-entities are gathered as mentions.
+   *
+   * @param collectPrenominalNamedEntities true if pre-nominal named-entities should be collected; false otherwise.
+   */
+  void setPrenominalNamedEntityCollection(boolean collectPrenominalNamedEntities);
+
+  /**
+   * Indicates whether this mention finder gathers pre-nominal named-entities as mentions.
+   *
+   * @return true if this mention finder collects pre-nominal named-entities as mentions
+   */
+  boolean isPrenominalNamedEntityCollection();
+
+  /**
+   * Indicates whether this mention finder gathers coordinated noun phrases as mentions.
+   *
+   * @return true if this mention finder collects coordinated noun phrases as mentions; false otherwise.
+   */
+  boolean isCoordinatedNounPhraseCollection();
+
+  /**
+   * Controls whether coordinated noun phrases are gathered as mentions.
+   *
+   * @param collectCoordinatedNounPhrases true if coordinated noun phrases should be collected; false otherwise.
+   */
+  void setCoordinatedNounPhraseCollection(boolean collectCoordinatedNounPhrases);
+
+  /**
+   * Finds and returns all mentions in the given parse.
+   *
+   * @param parse A top level parse from which mentions are gathered.
+   *
+   * @return an array of mentions which implement the {@link Mention} interface.
+   */
+  Mention[] getMentions(Parse parse);
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/PTBHeadFinder.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/PTBHeadFinder.java
new file mode 100644
index 0000000..723dca8
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/PTBHeadFinder.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.mention;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+
+/**
+ * Finds head information from Penn Treebank style parses.
+ */
+public final class PTBHeadFinder implements HeadFinder {
+
+  // NOTE(review): lazy singleton initialization is not thread-safe;
+  // confirm the linker only constructs head finders from a single thread.
+  private static PTBHeadFinder instance;
+
+  /** Syntactic types which may never serve as the head token of a phrase. */
+  private static final Set<String> skipSet = new HashSet<String>();
+  static {
+    skipSet.add("POS");
+    skipSet.add(",");
+    skipSet.add(":");
+    skipSet.add(".");
+    skipSet.add("''");
+    skipSet.add("-RRB-");
+    skipSet.add("-RCB-");
+  }
+
+  private PTBHeadFinder() {}
+
+  /**
+   * Returns an instance of this head finder.
+   * @return an instance of this head finder.
+   */
+  public static HeadFinder getInstance() {
+    if (instance == null) {
+      instance = new PTBHeadFinder();
+    }
+    return instance;
+  }
+
+  /**
+   * Returns the head constituent of the specified noun phrase, or null when the
+   * parse is null, is not a noun phrase, is possessive, or is a coordinated NP
+   * (coordinated NPs are treated as their own entities).
+   *
+   * @param p The parse whose head is sought.
+   * @return the head constituent or null if no child head exists.
+   */
+  public Parse getHead(Parse p) {
+    if (p == null) {
+      return null;
+    }
+    if (p.isNounPhrase()) {
+      List<Parse> parts = p.getSyntacticChildren();
+      //shallow parse POS
+      if (parts.size() > 2) {
+        Parse child0 = parts.get(0);
+        Parse child1 = parts.get(1);
+        Parse child2 = parts.get(2);
+        if (child1.isToken() && child1.getSyntacticType().equals("POS") && child0.isNounPhrase() && child2.isNounPhrase()) {
+          return child2;
+        }
+      }
+      //full parse POS
+      if (parts.size() > 1) {
+        Parse child0 = parts.get(0);
+        if (child0.isNounPhrase()) {
+          List<Parse> ctoks = child0.getTokens();
+          if (ctoks.isEmpty()) {
+            // Bug fix: previously this only logged and then indexed
+            // ctoks.get(-1), throwing IndexOutOfBoundsException.
+            System.err.println("PTBHeadFinder: NP "+child0+" with no tokens");
+          }
+          else {
+            Parse tok = ctoks.get(ctoks.size() - 1);
+            if (tok.getSyntacticType().equals("POS")) {
+              return null;
+            }
+          }
+        }
+      }
+      //coordinated nps are their own entities
+      if (parts.size() > 1) {
+        for (int pi = 1; pi < parts.size() - 1; pi++) {
+          Parse child = parts.get(pi);
+          if (child.isToken() && child.getSyntacticType().equals("CC")) {
+            return null;
+          }
+        }
+      }
+      //all other NPs: the first NP child is the head
+      for (int pi = 0; pi < parts.size(); pi++) {
+        Parse child = parts.get(pi);
+        if (child.isNounPhrase()) {
+          return child;
+        }
+      }
+      return null;
+    }
+    else {
+      return null;
+    }
+  }
+
+  /**
+   * Returns the token index of the head of the specified parse, preferring the
+   * right-most non-skipped token which precedes any non-initial S-type child.
+   *
+   * @param p The parse whose head token index is sought.
+   * @return the index of the head token within {@code p}'s tokens; may be
+   *     negative when the parse has no tokens.
+   */
+  public int getHeadIndex(Parse p) {
+    List<Parse> sChildren = p.getSyntacticChildren();
+    boolean countTokens = false;
+    int tokenCount = 0;
+    //check for NP -> NN S type structures and return last token before S as head.
+    for (int sci=0,scn = sChildren.size();sci<scn;sci++) {
+      Parse sc = sChildren.get(sci);
+      if (sc.getSyntacticType().startsWith("S")) {
+        if (sci != 0) {
+          countTokens = true;
+        }
+        else {
+          //NP -> S production: assume right-most head
+        }
+      }
+      if (countTokens) {
+        tokenCount+=sc.getTokens().size();
+      }
+    }
+    List<Parse> toks = p.getTokens();
+    if (toks.isEmpty()) {
+      System.err.println("PTBHeadFinder.getHeadIndex(): empty tok list for parse "+p);
+    }
+    for (int ti = toks.size() - tokenCount -1; ti >= 0; ti--) {
+      Parse tok = toks.get(ti);
+      if (!skipSet.contains(tok.getSyntacticType())) {
+        return ti;
+      }
+    }
+    return toks.size() - tokenCount -1;
+  }
+
+  /** Returns the bottom-most head of a <code>Parse</code>. If no
+      head is available which is a child of <code>p</code> then
+      <code>p</code> is returned. */
+  public Parse getLastHead(Parse p) {
+    Parse head;
+    while (null != (head = getHead(p))) {
+      p = head;
+    }
+    return p;
+  }
+
+  /**
+   * Returns the head token of the specified parse.
+   *
+   * @param p The parse whose head token is sought.
+   * @return the head token of {@code p}.
+   */
+  public Parse getHeadToken(Parse p) {
+    List<Parse> toks = p.getTokens();
+    return toks.get(getHeadIndex(p));
+  }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/PTBMentionFinder.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/PTBMentionFinder.java
new file mode 100644
index 0000000..c51e336
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/PTBMentionFinder.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.mention;
+
+/**
+ * Finds mentions from Penn Treebank style parses.
+ */
+public class PTBMentionFinder extends AbstractMentionFinder {
+
+  private static PTBMentionFinder instance = null;
+
+  /**
+   * Creates a new mention finder with the specified head finder.
+   * @param hf The head finder.
+   */
+  private PTBMentionFinder(HeadFinder hf) {
+    collectPrenominalNamedEntities = false;
+    collectCoordinatedNounPhrases = true;
+    headFinder = hf;
+  }
+
+  /**
+   * Retrieves the one and only existing instance, re-creating it when the
+   * requested head finder differs from the cached one.
+   *
+   * @param hf The head finder to use.
+   * @return the one and only existing instance
+   */
+  public static PTBMentionFinder getInstance(HeadFinder hf) {
+    if (instance == null || instance.headFinder != hf) {
+      instance = new PTBMentionFinder(hf);
+    }
+    return instance;
+  }
+
+  /*
+  private boolean isTraceNp(Parse np){
+    List sc = np.getSyntacticChildren();
+    return (sc.size() == 0);
+  }
+
+  protected List getNounPhrases(Parse p) {
+    List nps = new ArrayList(p.getNounPhrases());
+    for (int npi = 0; npi < nps.size(); npi++) {
+      Parse np = (Parse) nps.get(npi);
+      if (!isTraceNp(np)) {
+        if (np.getSyntacticChildren().size()!=0) {
+          List snps = np.getNounPhrases();
+          for (int snpi=0,snpl=snps.size();snpi<snpl;snpi++) {
+            Parse snp = (Parse) snps.get(snpi);
+            if (!snp.isParentNAC() && !isTraceNp(snp)) {
+              nps.add(snp);
+            }
+          }
+        }
+      }
+      else {
+        nps.remove(npi);
+        npi--;
+      }
+    }
+    return (nps);
+  }
+  */
+
+  /** Moves entity ids assigned to basal nps and possessives to their
+   * maximally containing np. Also assigns head information of the basal
+   * noun phrase to the maximally containing np.
+   * @deprecated No one uses this any more.
+   *
+  private void propigateEntityIds(Map headMap) {
+    for (Iterator ki = headMap.keySet().iterator(); ki.hasNext();) {
+      Parse np = (Parse) ki.next();
+      if (isBasalNounPhrase(np) || isPossessive(np)) {
+        int ei = np.getEntityId();
+        if (ei != -1) {
+          Parse curHead = np;
+          Parse newHead = null;
+          while ((newHead = (Parse) headMap.get(curHead)) != null) {
+            curHead.removeEntityId();
+            curHead = newHead;
+          }
+          curHead.setEntityId(ei);
+          curHead.setProperty("head", np.getSpan().toString());
+        }
+      }
+    }
+  }
+  */
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/Parse.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/Parse.java
new file mode 100644
index 0000000..6cc07ad
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/Parse.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.mention;
+
+import java.util.List;
+
+import opennlp.tools.util.Span;
+
+/**
+ * Interface for syntactic and named-entity information to be used in coreference
+ * annotation.
+ */
+public interface Parse extends Comparable<Parse> {
+
+  /**
+   * Returns the index of the sentence which contains this parse.
+   *
+   * @return The index of the sentence which contains this parse.
+   */
+  int getSentenceNumber();
+
+  /**
+   * Returns a list of all noun phrases contained by this parse.
+   * The noun phrases in this list also implement the {@link Parse} interface.
+   *
+   * @return a list of all the noun phrases contained by this parse.
+   */
+  List<Parse> getNounPhrases();
+
+  /**
+   * Returns a list of all named entities contained by this parse.
+   * The named entities in this list also implement the {@link Parse} interface.
+   *
+   * @return a list of all the named entities contained by this parse.
+   */
+  List<Parse> getNamedEntities();
+
+  /**
+   * Returns a list of the children of this object. The children also
+   * implement the {@link Parse} interface.
+   *
+   * @return a list of the children of this object.
+   */
+  List<Parse> getChildren();
+
+  /**
+   * Returns a list of the children of this object which are constituents or tokens.
+   * The children also implement the {@link Parse} interface. This allows
+   * implementations which contain additional nodes for things such as semantic
+   * categories to hide those nodes from components which only care about
+   * syntactic nodes.
+   *
+   * @return a list of the children of this object which are constituents or tokens.
+   */
+  List<Parse> getSyntacticChildren();
+
+  /**
+   * Returns a list of the tokens contained by this object. The tokens in this
+   * list also implement the {@link Parse} interface.
+   *
+   * @return the tokens
+   */
+  List<Parse> getTokens();
+
+  /**
+   * Returns the syntactic type of this node. Typically this is the
+   * part-of-speech or constituent labeling.
+   *
+   * @return the syntactic type.
+   */
+  String getSyntacticType();
+
+  /**
+   * Returns the named-entity type of this node.
+   *
+   * @return the named-entity type.
+   */
+  String getEntityType();
+
+  /**
+   * Determines whether this node has an ancestor of type NAC.
+   *
+   * @return true if this node has an ancestor of type NAC, false otherwise.
+   */
+  boolean isParentNAC();
+
+  /**
+   * Returns the parent parse of this parse node.
+   *
+   * @return the parent parse of this parse node.
+   */
+  Parse getParent();
+
+  /**
+   * Indicates whether this parse is a named-entity.
+   *
+   * @return True if this parse is a named-entity; false otherwise.
+   */
+  boolean isNamedEntity();
+
+  /**
+   * Indicates whether this parse is a noun phrase.
+   *
+   * @return True if this parse is a noun phrase; false otherwise.
+   */
+  boolean isNounPhrase();
+
+  /**
+   * Indicates whether this parse is a sentence.
+   *
+   * @return True if this parse is a sentence; false otherwise.
+   */
+  boolean isSentence();
+
+  /**
+   * Indicates whether this parse is a coordinated noun phrase.
+   *
+   * @return True if this parse is a coordinated noun phrase; false otherwise.
+   */
+  boolean isCoordinatedNounPhrase();
+
+  /**
+   * Indicates whether this parse is a token.
+   *
+   * @return True if this parse is a token; false otherwise.
+   */
+  boolean isToken();
+
+  // Redeclared to emphasize implementations should produce the parse text.
+  String toString();
+
+  /**
+   * Returns an entity id associated with this parse and coreferent parses.
+   * This is only used for training on already annotated coreference annotation.
+   *
+   * @return an entity id associated with this parse and coreferent parses.
+   */
+  int getEntityId();
+
+  /**
+   * Returns the character offsets of this parse node.
+   *
+   * @return The span representing the character offsets of this parse node.
+   */
+  Span getSpan();
+
+  /**
+   * Returns the first token which is not a child of this parse. If the first
+   * token of a sentence is a child of this parse then null is returned.
+   *
+   * @return the first token which is not a child of this parse or null if no such token exists.
+   */
+  Parse getPreviousToken();
+
+  /**
+   * Returns the next token which is not a child of this parse. If the last
+   * token of a sentence is a child of this parse then null is returned.
+   *
+   * @return the next token which is not a child of this parse or null if no such token exists.
+   */
+  Parse getNextToken();
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/ShallowParseMentionFinder.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/ShallowParseMentionFinder.java
new file mode 100644
index 0000000..553d2ba
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/ShallowParseMentionFinder.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.mention;
+
+/**
+ * Finds mentions from shallow np-chunking based parses.
+ */
+public class ShallowParseMentionFinder extends AbstractMentionFinder {
+
+  private static ShallowParseMentionFinder instance;
+
+  private ShallowParseMentionFinder(HeadFinder hf) {
+    headFinder = hf;
+    collectPrenominalNamedEntities = true;
+    collectCoordinatedNounPhrases = true;
+  }
+
+  /**
+   * Retrieves the one and only existing instance, re-creating it when the
+   * requested head finder differs from the cached one.
+   *
+   * @param hf The head finder to use.
+   * @return one and only existing instance
+   */
+  public static ShallowParseMentionFinder getInstance(HeadFinder hf) {
+    if (instance == null || instance.headFinder != hf) {
+      instance = new ShallowParseMentionFinder(hf);
+    }
+    return instance;
+  }
+
+  /*
+  protected final List getNounPhrases(Parse p) {
+    List nps = p.getNounPhrases();
+    List basals = new ArrayList();
+    for (int ni=0,ns=nps.size();ni<ns;ni++) {
+      Parse np = (Parse) nps.get(ni);
+      if (isBasalNounPhrase(np)) {
+        basals.add(np);
+      }
+      else if (isPossessive(np)) {
+        basals.add(np);
+        basals.addAll(getNounPhrases(np));
+      }
+      else if (isOfPrepPhrase(np)) {
+        basals.add(np);
+        basals.addAll(getNounPhrases(np));
+      }
+      else {
+        basals.addAll(getNounPhrases(np));
+      }
+    }
+    return(basals);
+  }
+  */
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/package-info.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/package-info.java
new file mode 100644
index 0000000..075aae6
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Package related to modeling mentions for coreference resolution.
+ */
+package opennlp.tools.coref.mention;
\ No newline at end of file
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/package-info.java b/opennlp-coref/src/main/java/opennlp/tools/coref/package-info.java
new file mode 100644
index 0000000..8ec4703
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Package related to performing coreference resolution.
+ */
+package opennlp.tools.coref;
\ No newline at end of file
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/AbstractResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/AbstractResolver.java
new file mode 100644
index 0000000..166b8dd
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/AbstractResolver.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.IOException;
+
+import opennlp.tools.coref.DiscourseEntity;
+import opennlp.tools.coref.DiscourseModel;
+import opennlp.tools.coref.mention.MentionContext;
+import opennlp.tools.coref.mention.Parse;
+import opennlp.tools.util.CountedSet;
+
+/**
+ * Default implementation of some methods in the {@link Resolver} interface.
+ */
+public abstract class AbstractResolver implements Resolver {
+
+ /**
+ * The number of previous entities that resolver should consider.
+ */
+ protected int numEntitiesBack;
+
+ /**
+ * Debugging variable which specifies whether error output is generated
+ * if a class excludes as possibly coreferent mentions which are in-fact
+ * coreferent.
+ */
+ protected boolean showExclusions;
+
+ /**
+ * Debugging variable which holds statistics about mention distances
+ * during training.
+ */
+ protected CountedSet<Integer> distances;
+
+ /**
+ * The number of sentences back this resolver should look for a referent.
+ */
+ protected int numSentencesBack;
+
+ public AbstractResolver(int neb) {
+ numEntitiesBack=neb;
+ showExclusions = true;
+ distances = new CountedSet<Integer>();
+ }
+
+ /**
+ * Returns the number of previous entities that resolver should consider.
+ *
+ * @return the number of previous entities that resolver should consider.
+ */
+ protected int getNumEntities() {
+ return numEntitiesBack;
+ }
+
+ /**
+ * Specifies the number of sentences back this resolver should look for a referent.
+ *
+ * @param nsb the number of sentences back this resolver should look for a referent.
+ */
+ public void setNumberSentencesBack(int nsb) {
+ numSentencesBack = nsb;
+ }
+
+ /**
+ * The number of entities that should be considered for resolution with the specified discourse model.
+ *
+ * @param dm The discourse model.
+ *
+ * @return number of entities that should be considered for resolution.
+ */
+ protected int getNumEntities(DiscourseModel dm) {
+ return Math.min(dm.getNumEntities(),numEntitiesBack);
+ }
+
+ /**
+ * Returns the head parse for the specified mention.
+ *
+ * @param mention The mention.
+ *
+ * @return the head parse for the specified mention.
+ */
+ protected Parse getHead(MentionContext mention) {
+ return mention.getHeadTokenParse();
+ }
+
+ /**
+ * Returns the index for the head word for the specified mention.
+ *
+ * @param mention The mention.
+ *
+ * @return the index for the head word for the specified mention.
+ */
+ protected int getHeadIndex(MentionContext mention) {
+ Parse[] mtokens = mention.getTokenParses();
+ for (int ti=mtokens.length-1;ti>=0;ti--) {
+ Parse tok = mtokens[ti];
+ if (!tok.getSyntacticType().equals("POS") && !tok.getSyntacticType().equals(",") &&
+ !tok.getSyntacticType().equals(".")) {
+ return ti;
+ }
+ }
+ return mtokens.length-1;
+ }
+
+ /**
+ * Returns the text of the head word for the specified mention.
+ *
+ * @param mention The mention.
+ *
+ * @return The text of the head word for the specified mention.
+ */
+ protected String getHeadString(MentionContext mention) {
+ return mention.getHeadTokenText().toLowerCase();
+ }
+
+ /**
+ * Determines if the specified entity is too far from the specified mention to be resolved to it.
+ * Once an entity has been determined to be out of range subsequent entities are not considered.
+ * To skip intermediate entities @see excluded.
+ *
+ * @param mention The mention which is being considered.
+ * @param entity The entity to which the mention is to be resolved.
+ *
+ * @return true is the entity is in range of the mention, false otherwise.
+ */
+ protected boolean outOfRange(MentionContext mention, DiscourseEntity entity) {
+ return false;
+ }
+
+ /**
+ * Excludes entities which you are not compatible with the entity under consideration. The default
+ * implementation excludes entities whose last extent contains the extent under consideration.
+ * This prevents possessive pronouns from referring to the noun phrases they modify and other
+ * undesirable things.
+ *
+ * @param mention The mention which is being considered as referential.
+ * @param entity The entity to which the mention is to be resolved.
+ *
+ * @return true if the entity should be excluded, false otherwise.
+ */
+ protected boolean excluded(MentionContext mention, DiscourseEntity entity) {
+ MentionContext cec = entity.getLastExtent();
+ return mention.getSentenceNumber() == cec.getSentenceNumber() &&
+ mention.getIndexSpan().getEnd() <= cec.getIndexSpan().getEnd();
+ }
+
+ public DiscourseEntity retain(MentionContext mention, DiscourseModel dm) {
+ int ei = 0;
+ if (mention.getId() == -1) {
+ return null;
+ }
+ for (; ei < dm.getNumEntities(); ei++) {
+ DiscourseEntity cde = dm.getEntity(ei);
+ MentionContext cec = cde.getLastExtent(); // candidate extent context
+ if (cec.getId() == mention.getId()) {
+ distances.add(ei);
+ return cde;
+ }
+ }
+ //System.err.println("AbstractResolver.retain: non-refering entity with id: "+ec.toText()+" id="+ec.id);
+ return null;
+ }
+
+ /**
+ * Returns the string of "_" delimited tokens for the specified mention.
+ *
+ * @param mention The mention.
+ *
+ * @return the string of "_" delimited tokens for the specified mention.
+ */
+ protected String featureString(MentionContext mention){
+ StringBuilder fs = new StringBuilder();
+ Object[] mtokens =mention.getTokens();
+ fs.append(mtokens[0].toString());
+ for (int ti=1,tl=mtokens.length;ti<tl;ti++) {
+ fs.append("_").append(mtokens[ti].toString());
+ }
+ return fs.toString();
+ }
+
+
+ public void train() throws IOException {};
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/CommonNounResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/CommonNounResolver.java
new file mode 100644
index 0000000..ab2497a
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/CommonNounResolver.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.coref.DiscourseEntity;
+import opennlp.tools.coref.mention.MentionContext;
+
+/**
+ * Resolves coreference between common nouns.
+ */
+public class CommonNounResolver extends MaxentResolver {
+
+ public CommonNounResolver(String projectName, ResolverMode m) throws IOException {
+ super(projectName,"cmodel", m, 80, true);
+ showExclusions = false;
+ preferFirstReferent = true;
+ }
+
+ public CommonNounResolver(String projectName, ResolverMode m, NonReferentialResolver nrr) throws IOException {
+ super(projectName,"cmodel", m, 80, true,nrr);
+ showExclusions = false;
+ preferFirstReferent = true;
+ }
+
+ @Override
+ protected List<String> getFeatures(MentionContext mention, DiscourseEntity entity) {
+ List<String> features = new ArrayList<String>();
+ features.addAll(super.getFeatures(mention, entity));
+ if (entity != null) {
+ features.addAll(ResolverUtils.getContextFeatures(mention));
+ features.addAll(ResolverUtils.getStringMatchFeatures(mention,entity));
+ }
+ return features;
+ }
+
+ public boolean canResolve(MentionContext mention) {
+ String firstTok = mention.getFirstTokenText().toLowerCase();
+ String firstTokTag = mention.getFirstToken().getSyntacticType();
+ boolean rv = mention.getHeadTokenTag().equals("NN") && !ResolverUtils.definiteArticle(firstTok, firstTokTag);
+ return rv;
+ }
+
+ @Override
+ protected boolean excluded(MentionContext ec, DiscourseEntity de) {
+ if (super.excluded(ec, de)) {
+ return true;
+ }
+ else {
+ MentionContext cec = de.getLastExtent();
+ return !canResolve(cec) || super.excluded(ec, de);
+ }
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/DefaultNonReferentialResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/DefaultNonReferentialResolver.java
new file mode 100644
index 0000000..1f3b8c6
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/DefaultNonReferentialResolver.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.maxent.GIS;
+import opennlp.maxent.io.BinaryGISModelReader;
+import opennlp.maxent.io.SuffixSensitiveGISModelReader;
+import opennlp.maxent.io.SuffixSensitiveGISModelWriter;
+import opennlp.model.Event;
+import opennlp.model.MaxentModel;
+import opennlp.tools.coref.mention.MentionContext;
+import opennlp.tools.coref.mention.Parse;
+import opennlp.tools.util.CollectionEventStream;
+
+/**
+ * Default implementation of the {@link NonReferentialResolver} interface.
+ */
+public class DefaultNonReferentialResolver implements NonReferentialResolver {
+
+ private MaxentModel model;
+ private List<Event> events;
+ private boolean loadAsResource;
+ private boolean debugOn = false;
+ private ResolverMode mode;
+ private String modelName;
+ private String modelExtension = ".bin.gz";
+ private int nonRefIndex;
+
+ public DefaultNonReferentialResolver(String projectName, String name, ResolverMode mode) throws IOException {
+ this.mode = mode;
+ this.modelName = projectName+"/"+name+".nr";
+ if (mode == ResolverMode.TRAIN) {
+ events = new ArrayList<Event>();
+ }
+ else if (mode == ResolverMode.TEST) {
+ if (loadAsResource) {
+ model = (new BinaryGISModelReader(new DataInputStream(this.getClass().getResourceAsStream(modelName)))).getModel();
+ }
+ else {
+ model = (new SuffixSensitiveGISModelReader(new File(modelName+modelExtension))).getModel();
+ }
+ nonRefIndex = model.getIndex(MaxentResolver.SAME);
+ }
+ else {
+ throw new RuntimeException("unexpected mode "+mode);
+ }
+ }
+
+ public double getNonReferentialProbability(MentionContext mention) {
+ List<String> features = getFeatures(mention);
+ double r = model.eval(features.toArray(new String[features.size()]))[nonRefIndex];
+ if (debugOn) System.err.println(this +" " + mention.toText() + " -> null " + r + " " + features);
+ return r;
+ }
+
+ public void addEvent(MentionContext ec) {
+ List<String> features = getFeatures(ec);
+ if (-1 == ec.getId()) {
+ events.add(new Event(MaxentResolver.SAME, features.toArray(new String[features.size()])));
+ }
+ else {
+ events.add(new Event(MaxentResolver.DIFF, features.toArray(new String[features.size()])));
+ }
+ }
+
+ protected List<String> getFeatures(MentionContext mention) {
+ List<String> features = new ArrayList<String>();
+ features.add(MaxentResolver.DEFAULT);
+ features.addAll(getNonReferentialFeatures(mention));
+ return features;
+ }
+
+ /**
+ * Returns a list of features used to predict whether the specified mention is non-referential.
+ * @param mention The mention under consideration.
+ * @return a list of features used to predict whether the specified mention is non-referential.
+ */
+ protected List<String> getNonReferentialFeatures(MentionContext mention) {
+ List<String> features = new ArrayList<String>();
+ Parse[] mtokens = mention.getTokenParses();
+ //System.err.println("getNonReferentialFeatures: mention has "+mtokens.length+" tokens");
+ for (int ti = 0; ti <= mention.getHeadTokenIndex(); ti++) {
+ Parse tok = mtokens[ti];
+ List<String> wfs = ResolverUtils.getWordFeatures(tok);
+ for (int wfi = 0; wfi < wfs.size(); wfi++) {
+ features.add("nr" + wfs.get(wfi));
+ }
+ }
+ features.addAll(ResolverUtils.getContextFeatures(mention));
+ return features;
+ }
+
+ public void train() throws IOException {
+ if (ResolverMode.TRAIN == mode) {
+ System.err.println(this +" referential");
+ if (debugOn) {
+ FileWriter writer = new FileWriter(modelName+".events");
+ for (Iterator<Event> ei=events.iterator();ei.hasNext();) {
+ Event e = ei.next();
+ writer.write(e.toString()+"\n");
+ }
+ writer.close();
+ }
+ (new SuffixSensitiveGISModelWriter(GIS.trainModel(new CollectionEventStream(events),100,10),new File(modelName+modelExtension))).persist();
+ }
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/DefiniteNounResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/DefiniteNounResolver.java
new file mode 100644
index 0000000..c64121d
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/DefiniteNounResolver.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.coref.DiscourseEntity;
+import opennlp.tools.coref.mention.MentionContext;
+
+/**
+ * Resolves coreference between definite noun-phrases.
+ */
+public class DefiniteNounResolver extends MaxentResolver {
+
+ public DefiniteNounResolver(String projectName, ResolverMode m) throws IOException {
+ super(projectName, "defmodel", m, 80);
+ //preferFirstReferent = true;
+ }
+
+ public DefiniteNounResolver(String projectName, ResolverMode m, NonReferentialResolver nrr) throws IOException {
+ super(projectName, "defmodel", m, 80,nrr);
+ //preferFirstReferent = true;
+ }
+
+
+ public boolean canResolve(MentionContext mention) {
+ Object[] mtokens = mention.getTokens();
+
+ String firstTok = mention.getFirstTokenText().toLowerCase();
+ boolean rv = mtokens.length > 1 && !mention.getHeadTokenTag().startsWith("NNP") && ResolverUtils.definiteArticle(firstTok, mention.getFirstTokenTag());
+ //if (rv) {
+ // System.err.println("defNp "+ec);
+ //}
+ return (rv);
+ }
+
+ @Override
+ protected List<String> getFeatures(MentionContext mention, DiscourseEntity entity) {
+ List<String> features = new ArrayList<String>();
+ features.addAll(super.getFeatures(mention, entity));
+ if (entity != null) {
+ features.addAll(ResolverUtils.getContextFeatures(mention));
+ features.addAll(ResolverUtils.getStringMatchFeatures(mention,entity));
+ features.addAll(ResolverUtils.getDistanceFeatures(mention,entity));
+ }
+ return (features);
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/FixedNonReferentialResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/FixedNonReferentialResolver.java
new file mode 100644
index 0000000..a911fac
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/FixedNonReferentialResolver.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.IOException;
+
+import opennlp.tools.coref.mention.MentionContext;
+
+/**
+ * Implementation of non-referential classifier which uses a fixed-value threshold.
+ */
+public class FixedNonReferentialResolver implements NonReferentialResolver {
+
+ private double nonReferentialProbability;
+
+ public FixedNonReferentialResolver(double nonReferentialProbability) {
+ this.nonReferentialProbability = nonReferentialProbability;
+ }
+
+ public double getNonReferentialProbability(MentionContext mention) {
+ return this.nonReferentialProbability;
+ }
+
+ public void addEvent(MentionContext mention) {}
+
+ public void train() throws IOException {}
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/IsAResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/IsAResolver.java
new file mode 100644
index 0000000..37629d3
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/IsAResolver.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import opennlp.tools.coref.DiscourseEntity;
+import opennlp.tools.coref.mention.MentionContext;
+
+/**
+ * Resolves coreference between appositives.
+ */
+public class IsAResolver extends MaxentResolver {
+
+ Pattern predicativePattern;
+
+ public IsAResolver(String projectName, ResolverMode m) throws IOException {
+ super(projectName, "/imodel", m, 20);
+ showExclusions = false;
+ //predicativePattern = Pattern.compile("^(,|am|are|is|was|were|--)$");
+ predicativePattern = Pattern.compile("^(,|--)$");
+ }
+
+ public IsAResolver(String projectName, ResolverMode m, NonReferentialResolver nrr) throws IOException {
+ super(projectName, "/imodel", m, 20,nrr);
+ showExclusions = false;
+ //predicativePattern = Pattern.compile("^(,|am|are|is|was|were|--)$");
+ predicativePattern = Pattern.compile("^(,|--)$");
+ }
+
+
+ public boolean canResolve(MentionContext ec) {
+ if (ec.getHeadTokenTag().startsWith("NN")) {
+ return (ec.getPreviousToken() != null && predicativePattern.matcher(ec.getPreviousToken().toString()).matches());
+ }
+ return false;
+ }
+
+ @Override
+ protected boolean excluded(MentionContext ec, DiscourseEntity de) {
+ MentionContext cec = de.getLastExtent();
+ //System.err.println("IsAResolver.excluded?: ec.span="+ec.getSpan()+" cec.span="+cec.getSpan()+" cec="+cec.toText()+" lastToken="+ec.getNextToken());
+ if (ec.getSentenceNumber() != cec.getSentenceNumber()) {
+ //System.err.println("IsAResolver.excluded: (true) not same sentence");
+ return (true);
+ }
+ //shallow parse appositives
+ //System.err.println("IsAResolver.excluded: ec="+ec.toText()+" "+ec.span+" cec="+cec.toText()+" "+cec.span);
+ if (cec.getIndexSpan().getEnd() == ec.getIndexSpan().getStart() - 2) {
+ return (false);
+ }
+ //full parse w/o trailing comma
+ if (cec.getIndexSpan().getEnd() == ec.getIndexSpan().getEnd()) {
+ //System.err.println("IsAResolver.excluded: (false) spans share end");
+ return (false);
+ }
+ //full parse w/ trailing comma or period
+ if (cec.getIndexSpan().getEnd() <= ec.getIndexSpan().getEnd() + 2 && (ec.getNextToken() != null && (ec.getNextToken().toString().equals(",") || ec.getNextToken().toString().equals(".")))) {
+ //System.err.println("IsAResolver.excluded: (false) spans end + punct");
+ return (false);
+ }
+ //System.err.println("IsAResolver.excluded: (true) default");
+ return (true);
+ }
+
+ @Override
+ protected boolean outOfRange(MentionContext ec, DiscourseEntity de) {
+ MentionContext cec = de.getLastExtent();
+ return (cec.getSentenceNumber() != ec.getSentenceNumber());
+ }
+
+ @Override
+ protected boolean defaultReferent(DiscourseEntity de) {
+ return (true);
+ }
+
+ @Override
+ protected List<String> getFeatures(MentionContext mention, DiscourseEntity entity) {
+ List<String> features = new ArrayList<String>();
+ features.addAll(super.getFeatures(mention, entity));
+ if (entity != null) {
+ MentionContext ant = entity.getLastExtent();
+ List<String> leftContexts = ResolverUtils.getContextFeatures(ant);
+ for (int ci = 0, cn = leftContexts.size(); ci < cn; ci++) {
+ features.add("l" + leftContexts.get(ci));
+ }
+ List<String> rightContexts = ResolverUtils.getContextFeatures(mention);
+ for (int ci = 0, cn = rightContexts.size(); ci < cn; ci++) {
+ features.add("r" + rightContexts.get(ci));
+ }
+ features.add("hts"+ant.getHeadTokenTag()+","+mention.getHeadTokenTag());
+ }
+ /*
+ if (entity != null) {
+ //System.err.println("MaxentIsResolver.getFeatures: ["+ec2.toText()+"] -> ["+de.getLastExtent().toText()+"]");
+ //previous word and tag
+ if (ant.prevToken != null) {
+ features.add("pw=" + ant.prevToken);
+ features.add("pt=" + ant.prevToken.getSyntacticType());
+ }
+ else {
+ features.add("pw=<none>");
+ features.add("pt=<none>");
+ }
+
+ //next word and tag
+ if (mention.nextToken != null) {
+ features.add("nw=" + mention.nextToken);
+ features.add("nt=" + mention.nextToken.getSyntacticType());
+ }
+ else {
+ features.add("nw=<none>");
+ features.add("nt=<none>");
+ }
+
+ //modifier word and tag for c1
+ int i = 0;
+ List c1toks = ant.tokens;
+ for (; i < ant.headTokenIndex; i++) {
+ features.add("mw=" + c1toks.get(i));
+ features.add("mt=" + ((Parse) c1toks.get(i)).getSyntacticType());
+ }
+ //head word and tag for c1
+ features.add("mh=" + c1toks.get(i));
+ features.add("mt=" + ((Parse) c1toks.get(i)).getSyntacticType());
+
+ //modifier word and tag for c2
+ i = 0;
+ List c2toks = mention.tokens;
+ for (; i < mention.headTokenIndex; i++) {
+ features.add("mw=" + c2toks.get(i));
+ features.add("mt=" + ((Parse) c2toks.get(i)).getSyntacticType());
+ }
+ //head word and tag for n2
+ features.add("mh=" + c2toks.get(i));
+ features.add("mt=" + ((Parse) c2toks.get(i)).getSyntacticType());
+
+ //word/tag pairs
+ for (i = 0; i < ant.headTokenIndex; i++) {
+ for (int j = 0; j < mention.headTokenIndex; j++) {
+ features.add("w=" + c1toks.get(i) + "|" + "w=" + c2toks.get(j));
+ features.add("w=" + c1toks.get(i) + "|" + "t=" + ((Parse) c2toks.get(j)).getSyntacticType());
+ features.add("t=" + ((Parse) c1toks.get(i)).getSyntacticType() + "|" + "w=" + c2toks.get(j));
+ features.add("t=" + ((Parse) c1toks.get(i)).getSyntacticType() + "|" + "t=" + ((Parse) c2toks.get(j)).getSyntacticType());
+ }
+ }
+ features.add("ht=" + ant.headTokenTag + "|" + "ht=" + mention.headTokenTag);
+ features.add("ht1=" + ant.headTokenTag);
+ features.add("ht2=" + mention.headTokenTag);
+ */
+ //semantic categories
+ /*
+ if (ant.neType != null) {
+ if (re.neType != null) {
+ features.add("sc="+ant.neType+","+re.neType);
+ }
+ else if (!re.headTokenTag.startsWith("NNP") && re.headTokenTag.startsWith("NN")) {
+ Set synsets = re.synsets;
+ for (Iterator si=synsets.iterator();si.hasNext();) {
+ features.add("sc="+ant.neType+","+si.next());
+ }
+ }
+ }
+ else if (!ant.headTokenTag.startsWith("NNP") && ant.headTokenTag.startsWith("NN")) {
+ if (re.neType != null) {
+ Set synsets = ant.synsets;
+ for (Iterator si=synsets.iterator();si.hasNext();) {
+ features.add("sc="+re.neType+","+si.next());
+ }
+ }
+ else if (!re.headTokenTag.startsWith("NNP") && re.headTokenTag.startsWith("NN")) {
+ //System.err.println("MaxentIsaResolover.getFeatures: both common re="+re.parse+" ant="+ant.parse);
+ Set synsets1 = ant.synsets;
+ Set synsets2 = re.synsets;
+ for (Iterator si=synsets1.iterator();si.hasNext();) {
+ Object synset = si.next();
+ if (synsets2.contains(synset)) {
+ features.add("sc="+synset);
+ }
+ }
+ }
+ }
+ }
+ */
+ //System.err.println("MaxentIsResolver.getFeatures: "+features.toString());
+ return (features);
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/MaxentResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/MaxentResolver.java
new file mode 100644
index 0000000..a15f67d
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/MaxentResolver.java
@@ -0,0 +1,347 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.maxent.GIS;
+import opennlp.maxent.io.SuffixSensitiveGISModelReader;
+import opennlp.maxent.io.SuffixSensitiveGISModelWriter;
+import opennlp.model.Event;
+import opennlp.model.MaxentModel;
+import opennlp.tools.coref.DiscourseEntity;
+import opennlp.tools.coref.DiscourseModel;
+import opennlp.tools.coref.mention.MentionContext;
+import opennlp.tools.coref.sim.TestSimilarityModel;
+import opennlp.tools.util.CollectionEventStream;
+
+/**
+ * Provides common functionality used by classes which implement the {@link Resolver} interface
+ * and use maximum entropy models to make resolution decisions.
+ */
+public abstract class MaxentResolver extends AbstractResolver {
+
+  /** Outcome when two mentions are coreferent. */
+  public static final String SAME = "same";
+  /** Outcome when two mentions are not coreferent. */
+  public static final String DIFF = "diff";
+  /** Default feature value. */
+  public static final String DEFAULT = "default";
+
+  /** When true, diagnostic output is written to System.err. */
+  private static boolean debugOn = false;
+
+  /** Base path (without extension) of the file backing this resolver's model. */
+  private String modelName;
+  /** Maximum entropy model used to score candidate referents (TEST mode only). */
+  private MaxentModel model;
+  /** Coreference probability per candidate; one extra slot holds the non-referential probability. */
+  private double[] candProbs;
+  /** Index of the {@link #SAME} outcome within the model's outcomes. */
+  private int sameIndex;
+  /** Whether this resolver is being trained or tested. */
+  private ResolverMode mode;
+  /** Training events accumulated by {@link #retain} (TRAIN mode only). */
+  private List<Event> events;
+
+  /** When true, this designates that the resolver should use the first referent encountered which is
+   * more preferable than non-reference.  When false all non-excluded referents within this resolver's
+   * range are considered.
+   */
+  protected boolean preferFirstReferent;
+
+  /** When true, this designates that training should consist of a single positive and a single
+   * negative example (when possible) for each mention. */
+  protected boolean pairedSampleSelection;
+
+  /** When true, this designates that the same maximum entropy model should be used for non-reference
+   * events (the pairing of a mention and the "null" reference) as is used for potentially
+   * referential pairs.  When false a separate model is created for these events.
+   */
+  protected boolean useSameModelForNonRef;
+
+  /** Similarity model shared by all maxent resolvers for semantic compatibility features. */
+  private static TestSimilarityModel simModel = null;
+
+  /** The model for computing non-referential probabilities. */
+  protected NonReferentialResolver nonReferentialResolver;
+
+  private static final String modelExtension = ".bin.gz";
+
+  /**
+   * Creates a maximum-entropy-based resolver which will look the specified number of entities
+   * back for a referent.  This constructor is only used for unit testing.
+   *
+   * @param numberOfEntitiesBack The number of entities back in the text that this resolver will
+   *     look for a referent.
+   * @param preferFirstReferent Set to true if the resolver should prefer the first referent which
+   *     is more likely than non-reference.
+   */
+  protected MaxentResolver(int numberOfEntitiesBack, boolean preferFirstReferent) {
+    super(numberOfEntitiesBack);
+    this.preferFirstReferent = preferFirstReferent;
+  }
+
+  /**
+   * Creates a maximum-entropy-based resolver with the specified model name, using the
+   * specified mode, which will look the specified number of entities back for a referent and
+   * prefer the first referent if specified.
+   *
+   * @param modelDirectory The name of the directory where the resolver models are stored.
+   * @param name The name of the file where this model will be read or written.
+   * @param mode The mode this resolver is being used in (training, testing).
+   * @param numberOfEntitiesBack The number of entities back in the text that this resolver will look
+   *     for a referent.
+   * @param preferFirstReferent Set to true if the resolver should prefer the first referent which is
+   *     more likely than non-reference.  This only affects testing.
+   * @param nonReferentialResolver Determines how likely it is that this entity is non-referential.
+   * @throws IOException If the model file is not found or can not be written to.
+   */
+  public MaxentResolver(String modelDirectory, String name, ResolverMode mode, int numberOfEntitiesBack, boolean preferFirstReferent, NonReferentialResolver nonReferentialResolver) throws IOException {
+    super(numberOfEntitiesBack);
+    this.preferFirstReferent = preferFirstReferent;
+    this.nonReferentialResolver = nonReferentialResolver;
+    this.mode = mode;
+    this.modelName = modelDirectory + "/" + name;
+    if (ResolverMode.TEST == this.mode) {
+      model = (new SuffixSensitiveGISModelReader(new File(modelName + modelExtension))).getModel();
+      sameIndex = model.getIndex(SAME);
+    }
+    else if (ResolverMode.TRAIN == this.mode) {
+      events = new ArrayList<Event>();
+    }
+    else {
+      System.err.println("Unknown mode: " + this.mode);
+    }
+    // add one slot for the non-referent possibility
+    candProbs = new double[getNumEntities() + 1];
+  }
+
+  /**
+   * Creates a maximum-entropy-based resolver with the specified model name, using the
+   * specified mode, which will look the specified number of entities back for a referent.
+   *
+   * @param modelDirectory The name of the directory where the resolver models are stored.
+   * @param modelName The name of the file where this model will be read or written.
+   * @param mode The mode this resolver is being used in (training, testing).
+   * @param numberEntitiesBack The number of entities back in the text that this resolver will look
+   *     for a referent.
+   * @throws IOException If the model file is not found or can not be written to.
+   */
+  public MaxentResolver(String modelDirectory, String modelName, ResolverMode mode, int numberEntitiesBack) throws IOException {
+    this(modelDirectory, modelName, mode, numberEntitiesBack, false);
+  }
+
+  public MaxentResolver(String modelDirectory, String modelName, ResolverMode mode, int numberEntitiesBack, NonReferentialResolver nonReferentialResolver) throws IOException {
+    this(modelDirectory, modelName, mode, numberEntitiesBack, false, nonReferentialResolver);
+  }
+
+  public MaxentResolver(String modelDirectory, String modelName, ResolverMode mode, int numberEntitiesBack, boolean preferFirstReferent) throws IOException {
+    this(modelDirectory, modelName, mode, numberEntitiesBack, preferFirstReferent, new DefaultNonReferentialResolver(modelDirectory, modelName, mode));
+  }
+
+  public MaxentResolver(String modelDirectory, String modelName, ResolverMode mode, int numberEntitiesBack, boolean preferFirstReferent, double nonReferentialProbability) throws IOException {
+    this(modelDirectory, modelName, mode, numberEntitiesBack, preferFirstReferent, new FixedNonReferentialResolver(nonReferentialProbability));
+  }
+
+  /**
+   * Scores every in-range, non-excluded candidate entity against the mention and returns the
+   * most probable one, or null when non-reference is more probable than every candidate.
+   *
+   * @param ec the mention being resolved.
+   * @param dm the current discourse model.
+   * @return the most probable referent or null if non-reference wins.
+   */
+  public DiscourseEntity resolve(MentionContext ec, DiscourseModel dm) {
+    DiscourseEntity de;
+    int ei = 0;
+    double nonReferentialProbability = nonReferentialResolver.getNonReferentialProbability(ec);
+    if (debugOn) {
+      System.err.println(this + ".resolve: " + ec.toText() + " -> " + "null " + nonReferentialProbability);
+    }
+    for (; ei < getNumEntities(dm); ei++) {
+      de = dm.getEntity(ei);
+      if (outOfRange(ec, de)) {
+        break;
+      }
+      if (excluded(ec, de)) {
+        candProbs[ei] = 0;
+        if (debugOn) {
+          System.err.println("excluded " + this + ".resolve: " + ec.toText() + " -> " + de + " " + candProbs[ei]);
+        }
+      }
+      else {
+        List<String> lfeatures = getFeatures(ec, de);
+        String[] features = lfeatures.toArray(new String[lfeatures.size()]);
+        try {
+          candProbs[ei] = model.eval(features)[sameIndex];
+        }
+        catch (ArrayIndexOutOfBoundsException e) {
+          // model has no SAME outcome; treat the candidate as impossible
+          candProbs[ei] = 0;
+        }
+        if (debugOn) {
+          System.err.println(this + ".resolve: " + ec.toText() + " -> " + de + " (" + ec.getGender() + "," + de.getGender() + ") " + candProbs[ei] + " " + lfeatures);
+        }
+      }
+      if (preferFirstReferent && candProbs[ei] > nonReferentialProbability) {
+        ei++; // advance so the non-referent slot lands after this candidate
+        break;
+      }
+    }
+    // slot ei always holds the non-referential probability
+    candProbs[ei] = nonReferentialProbability;
+
+    // find the most probable candidate; index ei means "no referent"
+    int maxCandIndex = 0;
+    for (int k = 1; k <= ei; k++) {
+      if (candProbs[k] > candProbs[maxCandIndex]) {
+        maxCandIndex = k;
+      }
+    }
+    if (maxCandIndex == ei) { // no referent
+      return null;
+    }
+    else {
+      de = dm.getEntity(maxCandIndex);
+      return de;
+    }
+  }
+
+  /**
+   * Returns whether the specified entity satisfies the criteria for being a default referent.
+   * This criteria is used to perform sample selection on the training data and to select a single
+   * non-referent entity.  Typically the criteria is a heuristic for a likely referent.
+   *
+   * @param de The discourse entity being considered for non-reference.
+   * @return True if the entity should be used as a default referent, false otherwise.
+   */
+  protected boolean defaultReferent(DiscourseEntity de) {
+    MentionContext ec = de.getLastExtent();
+    return ec.getNounPhraseSentenceIndex() == 0;
+  }
+
+  /**
+   * In TRAIN mode, collects SAME/DIFF training events for the mention against each in-range
+   * candidate entity and returns the annotated referent (if reachable); otherwise delegates
+   * to the superclass behavior.
+   */
+  @Override
+  public DiscourseEntity retain(MentionContext mention, DiscourseModel dm) {
+    if (ResolverMode.TRAIN == mode) {
+      DiscourseEntity de = null;
+      boolean referentFound = false;
+      boolean hasReferentialCandidate = false;
+      boolean nonReferentFound = false;
+      for (int ei = 0; ei < getNumEntities(dm); ei++) {
+        DiscourseEntity cde = dm.getEntity(ei);
+        MentionContext entityMention = cde.getLastExtent();
+        if (outOfRange(mention, cde)) {
+          // any annotated referent beyond this point is unreachable for this resolver
+          break;
+        }
+        if (excluded(mention, cde)) {
+          if (showExclusions) {
+            if (mention.getId() != -1 && entityMention.getId() == mention.getId()) {
+              System.err.println(this + ".retain: Referent excluded: (" + mention.getId() + ") " + mention.toText() + " " + mention.getIndexSpan() + " -> (" + entityMention.getId() + ") " + entityMention.toText() + " " + entityMention.getSpan() + " " + this);
+            }
+          }
+        }
+        else {
+          hasReferentialCandidate = true;
+          boolean useAsDifferentExample = defaultReferent(cde);
+          List<String> features = getFeatures(mention, cde);
+
+          // add a training event for this mention/candidate pair
+          if (debugOn) {
+            System.err.println(this + ".retain: " + mention.getId() + " " + mention.toText() + " -> " + entityMention.getId() + " " + cde);
+          }
+          if (mention.getId() != -1 && entityMention.getId() == mention.getId()) {
+            referentFound = true;
+            events.add(new Event(SAME, features.toArray(new String[features.size()])));
+            de = cde;
+            distances.add(ei);
+          }
+          else if (!pairedSampleSelection || (!nonReferentFound && useAsDifferentExample)) {
+            nonReferentFound = true;
+            events.add(new Event(DIFF, features.toArray(new String[features.size()])));
+          }
+        }
+        if (pairedSampleSelection && referentFound && nonReferentFound) {
+          break;
+        }
+        if (preferFirstReferent && referentFound) {
+          break;
+        }
+      }
+      // only train the non-referential model when the mention had some viable candidate
+      if (hasReferentialCandidate) {
+        nonReferentialResolver.addEvent(mention);
+      }
+      return de;
+    }
+    else {
+      return super.retain(mention, dm);
+    }
+  }
+
+  /**
+   * Returns a list of features for deciding whether the specified mention refers to the
+   * specified discourse entity.
+   *
+   * @param mention the mention being considered as possibly referential.
+   * @param entity The discourse entity with which the mention is being considered referential.
+   * @return a list of features used to predict reference between the specified mention and entity.
+   */
+  protected List<String> getFeatures(MentionContext mention, DiscourseEntity entity) {
+    List<String> features = new ArrayList<String>();
+    features.add(DEFAULT);
+    features.addAll(ResolverUtils.getCompatibilityFeatures(mention, entity, simModel));
+    return features;
+  }
+
+  @Override
+  public void train() throws IOException {
+    if (ResolverMode.TRAIN == mode) {
+      if (debugOn) {
+        System.err.println(this + " referential");
+        // close the event dump file even when a write fails (the original leaked the writer)
+        FileWriter writer = new FileWriter(modelName + ".events");
+        try {
+          for (Iterator<Event> ei = events.iterator(); ei.hasNext();) {
+            Event e = ei.next();
+            writer.write(e.toString() + "\n");
+          }
+        }
+        finally {
+          writer.close();
+        }
+      }
+      (new SuffixSensitiveGISModelWriter(GIS.trainModel(new CollectionEventStream(events), 100, 10), new File(modelName + modelExtension))).persist();
+      nonReferentialResolver.train();
+    }
+  }
+
+  /** Sets the similarity model shared by all maxent resolvers. */
+  public static void setSimilarityModel(TestSimilarityModel sm) {
+    simModel = sm;
+  }
+
+  @Override
+  protected boolean excluded(MentionContext ec, DiscourseEntity de) {
+    // Gender/number/semantic compatibility were once hard exclusions here; they are now
+    // expressed as model features instead, so this simply defers to the superclass.
+    return super.excluded(ec, de);
+  }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/NonReferentialResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/NonReferentialResolver.java
new file mode 100644
index 0000000..d042188
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/NonReferentialResolver.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.IOException;
+
+import opennlp.tools.coref.mention.MentionContext;
+
+/**
+ * Provides the interface for an object to provide a resolver with a non-referential
+ * probability.  Non-referential resolvers compute the probability that a particular mention refers
+ * to no antecedent.  This probability can then compete with the probability that
+ * a mention refers with a specific antecedent.
+ */
+public interface NonReferentialResolver {
+
+  /**
+   * Returns the probability that the specified mention doesn't refer to any previous mention.
+   *
+   * @param mention The mention under consideration.
+   * @return A probability that the specified mention doesn't refer to any previous mention.
+   */
+  public double getNonReferentialProbability(MentionContext mention);
+
+  /**
+   * Designates that the specified mention be used for training.
+   *
+   * @param mention The mention to be used.  The mention id is used to determine
+   * whether this mention is referential or non-referential.
+   */
+  public void addEvent(MentionContext mention);
+
+  /**
+   * Trains a model based on the events given to this resolver via
+   * {@link #addEvent(MentionContext)}.
+   *
+   * @throws IOException When the model can not be written out.
+   */
+  public void train() throws IOException;
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/PerfectResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/PerfectResolver.java
new file mode 100644
index 0000000..5d3053d
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/PerfectResolver.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import opennlp.tools.coref.DiscourseEntity;
+import opennlp.tools.coref.DiscourseModel;
+import opennlp.tools.coref.mention.MentionContext;
+
+/**
+ * Resolver used in training to update the discourse model based on the coreference annotation.
+ * It never proposes an antecedent of its own; linking is driven entirely by the annotated ids.
+ */
+public class PerfectResolver extends AbstractResolver {
+
+  public PerfectResolver() {
+    super(0);
+  }
+
+  /** Accepts every mention, regardless of its type. */
+  public boolean canResolve(MentionContext ec) {
+    return true;
+  }
+
+  /** Always declines to propose an antecedent. */
+  public DiscourseEntity resolve(MentionContext ec, DiscourseModel dm) {
+    return null;
+  }
+
+  /** No candidate entity is ever considered out of range for this resolver. */
+  @Override
+  protected boolean outOfRange(MentionContext ec, DiscourseEntity de) {
+    return false;
+  }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/PluralNounResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/PluralNounResolver.java
new file mode 100644
index 0000000..53d66d4
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/PluralNounResolver.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.coref.DiscourseEntity;
+import opennlp.tools.coref.mention.MentionContext;
+
+
+/**
+ * Resolves coreference between plural nouns.
+ */
+public class PluralNounResolver extends MaxentResolver {
+
+  public PluralNounResolver(String projectName, ResolverMode m) throws IOException {
+    super(projectName, "plmodel", m, 80, true);
+    showExclusions = false;
+  }
+
+  public PluralNounResolver(String projectName, ResolverMode m, NonReferentialResolver nrr) throws IOException {
+    super(projectName, "plmodel", m, 80, true, nrr);
+    showExclusions = false;
+  }
+
+  @Override
+  protected List<String> getFeatures(MentionContext mention, DiscourseEntity entity) {
+    List<String> features = new ArrayList<String>();
+    features.addAll(super.getFeatures(mention, entity));
+    if (entity != null) {
+      features.addAll(ResolverUtils.getContextFeatures(mention));
+      features.addAll(ResolverUtils.getStringMatchFeatures(mention, entity));
+    }
+    return features;
+  }
+
+  /** Handles plural common nouns (NNS head) that are not introduced by a definite article. */
+  public boolean canResolve(MentionContext mention) {
+    String firstTok = mention.getFirstTokenText().toLowerCase();
+    String firstTokTag = mention.getFirstToken().getSyntacticType();
+    return mention.getHeadTokenTag().equals("NNS") && !ResolverUtils.definiteArticle(firstTok, firstTokTag);
+  }
+
+  @Override
+  protected boolean excluded(MentionContext mention, DiscourseEntity entity) {
+    if (super.excluded(mention, entity)) {
+      return true;
+    }
+    // The candidate entity's most recent mention must itself be a plural noun.
+    // (The original re-invoked super.excluded here, which was redundant: it had
+    // already returned false above.)
+    MentionContext cec = entity.getLastExtent();
+    return !cec.getHeadTokenTag().equals("NNS");
+  }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/PluralPronounResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/PluralPronounResolver.java
new file mode 100644
index 0000000..85c8c59
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/PluralPronounResolver.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.coref.DiscourseEntity;
+import opennlp.tools.coref.mention.MentionContext;
+
+/**
+ * Resolves coreference between plural pronouns and their referents.
+ */
+public class PluralPronounResolver extends MaxentResolver {
+
+  /** Maximum number of sentences to look back for the referent of a plural pronoun. */
+  private static final int NUM_SENTS_BACK_PRONOUNS = 2;
+
+  public PluralPronounResolver(String projectName, ResolverMode m) throws IOException {
+    super(projectName, "tmodel", m, 30);
+  }
+
+  public PluralPronounResolver(String projectName, ResolverMode m, NonReferentialResolver nrr) throws IOException {
+    super(projectName, "tmodel", m, 30, nrr);
+  }
+
+  @Override
+  protected List<String> getFeatures(MentionContext mention, DiscourseEntity entity) {
+    List<String> features = new ArrayList<String>();
+    features.addAll(super.getFeatures(mention, entity));
+    if (entity != null) {
+      // generate pronoun-with-referent features
+      features.addAll(ResolverUtils.getPronounMatchFeatures(mention, entity));
+      MentionContext cec = entity.getLastExtent();
+      features.addAll(ResolverUtils.getDistanceFeatures(mention, entity));
+      features.addAll(ResolverUtils.getContextFeatures(cec));
+      features.add(ResolverUtils.getMentionCountFeature(entity));
+    }
+    return features;
+  }
+
+  @Override
+  protected boolean outOfRange(MentionContext mention, DiscourseEntity entity) {
+    // a candidate is out of range once its last mention is too many sentences back
+    MentionContext cec = entity.getLastExtent();
+    return mention.getSentenceNumber() - cec.getSentenceNumber() > NUM_SENTS_BACK_PRONOUNS;
+  }
+
+  /** Handles third-person plural pronouns (PRP* head matching the plural pronoun pattern). */
+  public boolean canResolve(MentionContext mention) {
+    String tag = mention.getHeadTokenTag();
+    return tag != null && tag.startsWith("PRP")
+        && ResolverUtils.pluralThirdPersonPronounPattern.matcher(mention.getHeadTokenText()).matches();
+  }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/ProperNounResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/ProperNounResolver.java
new file mode 100644
index 0000000..e922af2
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/ProperNounResolver.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.StringTokenizer;
+
+import opennlp.tools.coref.DiscourseEntity;
+import opennlp.tools.coref.mention.MentionContext;
+
+/**
+ * Resolves coreference between proper nouns.
+ */
+public class ProperNounResolver extends MaxentResolver {
+
+  /** Maps acronyms to known expansions and expansions to known acronyms (shared across instances). */
+  private static Map<String, Set<String>> acroMap;
+  /** Guards against re-reading the acronym database for every resolver instance. */
+  private static boolean acroMapLoaded = false;
+
+  public ProperNounResolver(String projectName, ResolverMode m) throws IOException {
+    super(projectName, "pnmodel", m, 500);
+    if (!acroMapLoaded) {
+      initAcronyms(projectName + "/acronyms");
+      acroMapLoaded = true;
+    }
+    showExclusions = false;
+  }
+
+  public ProperNounResolver(String projectName, ResolverMode m, NonReferentialResolver nonRefResolver) throws IOException {
+    super(projectName, "pnmodel", m, 500, nonRefResolver);
+    if (!acroMapLoaded) {
+      initAcronyms(projectName + "/acronyms");
+      acroMapLoaded = true;
+    }
+    showExclusions = false;
+  }
+
+  /** Handles proper nouns (NNP head) and numeric mentions (CD head). */
+  public boolean canResolve(MentionContext mention) {
+    return mention.getHeadTokenTag().startsWith("NNP") || mention.getHeadTokenTag().startsWith("CD");
+  }
+
+  /**
+   * Loads the tab-separated acronym database at the given path.  Each line holds an acronym
+   * and its full form; both directions of the association are recorded.  A missing database
+   * is reported but not fatal.
+   */
+  private void initAcronyms(String name) {
+    acroMap = new HashMap<String, Set<String>>(15000);
+    try {
+      BufferedReader str = new BufferedReader(new FileReader(name));
+      // close the reader even if a line is malformed (the original leaked it)
+      try {
+        String line;
+        while (null != (line = str.readLine())) {
+          StringTokenizer st = new StringTokenizer(line, "\t");
+          String acro = st.nextToken();
+          String full = st.nextToken();
+          addMapping(acro, full);
+          addMapping(full, acro);
+        }
+      }
+      finally {
+        str.close();
+      }
+    }
+    catch (IOException e) {
+      System.err.println("ProperNounResolver.initAcronyms: Acronym Database not found: " + e);
+    }
+  }
+
+  /** Records a single directed key -&gt; value acronym association in the shared map. */
+  private static void addMapping(String key, String value) {
+    Set<String> exSet = acroMap.get(key);
+    if (exSet == null) {
+      exSet = new HashSet<String>();
+      acroMap.put(key, exSet);
+    }
+    exSet.add(value);
+  }
+
+  private boolean isAcronym(String ecStrip, String xecStrip) {
+    Set<String> exSet = acroMap.get(ecStrip);
+    return exSet != null && exSet.contains(xecStrip);
+  }
+
+  /**
+   * Returns a single "knownAcronym" feature when the mention and the entity's proper-noun
+   * extent are a known acronym/expansion pair, otherwise an empty list.
+   */
+  protected List<String> getAcronymFeatures(MentionContext mention, DiscourseEntity entity) {
+    MentionContext xec = ResolverUtils.getProperNounExtent(entity);
+    String ecStrip = ResolverUtils.stripNp(mention);
+    String xecStrip = ResolverUtils.stripNp(xec);
+    if (ecStrip != null && xecStrip != null && isAcronym(ecStrip, xecStrip)) {
+      List<String> features = new ArrayList<String>(1);
+      features.add("knownAcronym");
+      return features;
+    }
+    return Collections.emptyList();
+  }
+
+  @Override
+  protected List<String> getFeatures(MentionContext mention, DiscourseEntity entity) {
+    List<String> features = new ArrayList<String>();
+    features.addAll(super.getFeatures(mention, entity));
+    if (entity != null) {
+      features.addAll(ResolverUtils.getStringMatchFeatures(mention, entity));
+      features.addAll(getAcronymFeatures(mention, entity));
+    }
+    return features;
+  }
+
+  @Override
+  public boolean excluded(MentionContext mention, DiscourseEntity entity) {
+    if (super.excluded(mention, entity)) {
+      return true;
+    }
+    // keep the candidate only if some previous mention of the entity is itself a proper noun
+    for (Iterator<MentionContext> ei = entity.getMentions(); ei.hasNext();) {
+      MentionContext xec = ei.next();
+      if (xec.getHeadTokenTag().startsWith("NNP")) {
+        return false;
+      }
+    }
+    return true;
+  }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/Resolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/Resolver.java
new file mode 100644
index 0000000..f237c7e
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/Resolver.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.IOException;
+
+import opennlp.tools.coref.DiscourseEntity;
+import opennlp.tools.coref.DiscourseModel;
+import opennlp.tools.coref.mention.MentionContext;
+
+/**
+ * Interface for coreference resolvers.
+ */
+public interface Resolver {
+
+  /**
+   * Returns true if this resolver is able to resolve the referring expression of the same type
+   * as the specified mention.
+   *
+   * @param mention The mention being considered for resolution.
+   *
+   * @return true if the resolver handles this type of referring
+   * expression, false otherwise.
+   */
+  public boolean canResolve(MentionContext mention);
+
+  /**
+   * Resolve this referring expression to a discourse entity in the discourse model.
+   *
+   * @param ec the referring expression.
+   * @param dm the discourse model.
+   *
+   * @return the discourse entity which the resolver believes this
+   * referring expression refers to or null if no discourse entity is
+   * coreferent with the referring expression.
+   */
+  public DiscourseEntity resolve(MentionContext ec, DiscourseModel dm);
+
+  /**
+   * Uses the specified mention and discourse model to train this resolver.
+   * All mentions sent to this method need to have their id fields set to indicate coreference
+   * relationships.
+   *
+   * @param mention The mention which is being used for training.
+   * @param model the discourse model.
+   *
+   * @return the discourse entity which is referred to by the referring
+   * expression or null if no discourse entity is referenced.
+   */
+  public DiscourseEntity retain(MentionContext mention, DiscourseModel model);
+
+  /**
+   * Retrains model on examples for which retain was called.
+   *
+   * @throws IOException If the model file can not be written out.
+   */
+  public void train() throws IOException;
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/ResolverMode.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/ResolverMode.java
new file mode 100644
index 0000000..bee5337
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/ResolverMode.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
/**
 * Enumerated type specifying the mode of a resolver.
 */
public enum ResolverMode {

  /** Testing mode: the resolver is applied using an existing model. */
  TEST,

  /** Training mode: the resolver is being trained. */
  TRAIN
}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/ResolverUtils.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/ResolverUtils.java
new file mode 100644
index 0000000..41ac100
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/ResolverUtils.java
@@ -0,0 +1,646 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import opennlp.tools.coref.DiscourseEntity;
+import opennlp.tools.coref.mention.MentionContext;
+import opennlp.tools.coref.mention.Parse;
+import opennlp.tools.coref.sim.GenderEnum;
+import opennlp.tools.coref.sim.NumberEnum;
+import opennlp.tools.coref.sim.TestSimilarityModel;
+
/**
 * This class provides a set of utilities for turning mentions into normalized strings and features.
 */
public class ResolverUtils {

  // Matches a token that ends with a period (abbreviation-like words, e.g. "Inc.").
  private static final Pattern ENDS_WITH_PERIOD = Pattern.compile("\\.$");
  // Matches a token that begins with a capital letter.
  private static final Pattern initialCaps = Pattern.compile("^[A-Z]");

  /** Regular expression for English singular third person pronouns. */
  public static final Pattern singularThirdPersonPronounPattern = Pattern.compile("^(he|she|it|him|her|his|hers|its|himself|herself|itself)$",Pattern.CASE_INSENSITIVE);
  /** Regular expression for English plural third person pronouns. */
  public static final Pattern pluralThirdPersonPronounPattern = Pattern.compile("^(they|their|theirs|them|themselves)$",Pattern.CASE_INSENSITIVE);
  /** Regular expression for English speech pronouns.
   *  NOTE(review): "you" appears twice in this alternation — harmless but redundant. */
  public static final Pattern speechPronounPattern = Pattern.compile("^(I|me|my|you|your|you|we|us|our|ours)$",Pattern.CASE_INSENSITIVE);
  /** Regular expression for English female pronouns. */
  public static final Pattern femalePronounPattern = Pattern.compile("^(she|her|hers|herself)$",Pattern.CASE_INSENSITIVE);
  /** Regular expression for English neuter pronouns. */
  public static final Pattern neuterPronounPattern = Pattern.compile("^(it|its|itself)$",Pattern.CASE_INSENSITIVE);
  /** Regular expression for English first person pronouns. */
  public static final Pattern firstPersonPronounPattern = Pattern.compile("^(I|me|my|we|our|us|ours)$",Pattern.CASE_INSENSITIVE);
  /** Regular expression for English singular second person pronouns. */
  public static final Pattern secondPersonPronounPattern = Pattern.compile("^(you|your|yours)$",Pattern.CASE_INSENSITIVE);
  /** Regular expression for English third person pronouns. */
  public static final Pattern thirdPersonPronounPattern = Pattern.compile("^(he|she|it|him|her|his|hers|its|himself|herself|itself|they|their|theirs|them|themselves)$",Pattern.CASE_INSENSITIVE);
  /** Regular expression for English singular pronouns. */
  public static final Pattern singularPronounPattern = Pattern.compile("^(I|me|my|he|she|it|him|her|his|hers|its|himself|herself|itself)$",Pattern.CASE_INSENSITIVE);
  /** Regular expression for English plural pronouns. */
  public static final Pattern pluralPronounPattern = Pattern.compile("^(we|us|our|ours|they|their|theirs|them|themselves)$",Pattern.CASE_INSENSITIVE);
  /** Regular expression for English male pronouns. */
  public static final Pattern malePronounPattern = Pattern.compile("^(he|him|his|himself)$",Pattern.CASE_INSENSITIVE);
  /** Regular expression for English honorifics. */
  public static final Pattern honorificsPattern = Pattern.compile("[A-Z][a-z]+\\.$|^[A-Z][b-df-hj-np-tv-xz]+$");
  /** Regular expression for English corporate designators. */
  public static final Pattern designatorsPattern = Pattern.compile("[a-z]\\.$|^[A-Z][b-df-hj-np-tv-xz]+$|^Co(rp)?$");


  // Feature values emitted by the number-compatibility test (getNumberCompatibilityFeature).
  private static final String NUM_COMPATIBLE = "num.compatible";
  private static final String NUM_INCOMPATIBLE = "num.incompatible";
  private static final String NUM_UNKNOWN = "num.unknown";

  // Feature values emitted by the gender-compatibility test (getGenderCompatibilityFeature).
  private static final String GEN_COMPATIBLE = "gen.compatible";
  private static final String GEN_INCOMPATIBLE = "gen.incompatible";
  private static final String GEN_UNKNOWN = "gen.unknown";
  // Feature values emitted by the semantic-compatibility test (getSemanticCompatibilityFeature).
  private static final String SIM_COMPATIBLE = "sim.compatible";
  private static final String SIM_INCOMPATIBLE = "sim.incompatible";
  private static final String SIM_UNKNOWN = "sim.unknown";


  // Similarity-model score above which mentions are considered compatible;
  // scores at or below (1 - MIN_SIM_PROB) are considered incompatible.
  private static final double MIN_SIM_PROB = 0.60;
+
+
+
+ /**
+ * Returns a list of features based on the surrounding context of the specified mention.
+ * @param mention he mention whose surround context the features model.
+ * @return a list of features based on the surrounding context of the specified mention
+ */
+ public static List<String> getContextFeatures(MentionContext mention) {
+ List<String> features = new ArrayList<String>();
+ if (mention.getPreviousToken() != null) {
+ features.add("pt=" + mention.getPreviousToken().getSyntacticType());
+ features.add("pw=" + mention.getPreviousToken().toString());
+ }
+ else {
+ features.add("pt=BOS");
+ features.add("pw=BOS");
+ }
+ if (mention.getNextToken() != null) {
+ features.add("nt=" + mention.getNextToken().getSyntacticType());
+ features.add("nw=" + mention.getNextToken().toString());
+ }
+ else {
+ features.add("nt=EOS");
+ features.add("nw=EOS");
+ }
+ if (mention.getNextTokenBasal() != null) {
+ features.add("bnt=" + mention.getNextTokenBasal().getSyntacticType());
+ features.add("bnw=" + mention.getNextTokenBasal().toString());
+ }
+ else {
+ features.add("bnt=EOS");
+ features.add("bnw=EOS");
+ }
+ return (features);
+ }
+
+ /**
+ * Returns a list of word features for the specified tokens.
+ * @param token The token for which features are to be computed.
+ * @return a list of word features for the specified tokens.
+ */
+ public static List<String> getWordFeatures(Parse token) {
+ List<String> wordFeatures = new ArrayList<String>();
+ String word = token.toString().toLowerCase();
+ String wf = "";
+ if (ENDS_WITH_PERIOD.matcher(word).find()) {
+ wf = ",endWithPeriod";
+ }
+ String tokTag = token.getSyntacticType();
+ wordFeatures.add("w=" + word + ",t=" + tokTag + wf);
+ wordFeatures.add("t=" + tokTag + wf);
+ return wordFeatures;
+ }
+
+ public static Set<String> constructModifierSet(Parse[] tokens, int headIndex) {
+ Set<String> modSet = new HashSet<String>();
+ for (int ti = 0; ti < headIndex; ti++) {
+ Parse tok = tokens[ti];
+ modSet.add(tok.toString().toLowerCase());
+ }
+ return (modSet);
+ }
+
+ public static String excludedDeterminerMentionString(MentionContext ec) {
+ StringBuilder sb = new StringBuilder();
+ boolean first = true;
+ Parse[] mtokens = ec.getTokenParses();
+ for (int ti = 0, tl = mtokens.length; ti < tl; ti++) {
+ Parse token = mtokens[ti];
+ String tag = token.getSyntacticType();
+ if (!tag.equals("DT")) {
+ if (!first) {
+ sb.append(" ");
+ }
+ sb.append(token.toString());
+ first = false;
+ }
+ }
+ return sb.toString();
+ }
+
+ public static String excludedHonorificMentionString(MentionContext ec) {
+ StringBuilder sb = new StringBuilder();
+ boolean first = true;
+ Object[] mtokens = ec.getTokens();
+ for (int ti = 0, tl = mtokens.length; ti < tl; ti++) {
+ String token = mtokens[ti].toString();
+ if (!honorificsPattern.matcher(token).matches()) {
+ if (!first) {
+ sb.append(" ");
+ }
+ sb.append(token);
+ first = false;
+ }
+ }
+ return sb.toString();
+ }
+
+ public static String excludedTheMentionString(MentionContext ec) {
+ StringBuilder sb = new StringBuilder();
+ boolean first = true;
+ Object[] mtokens = ec.getTokens();
+ for (int ti = 0, tl = mtokens.length; ti < tl; ti++) {
+ String token = mtokens[ti].toString();
+ if (!token.equals("the") && !token.equals("The") && !token.equals("THE")) {
+ if (!first) {
+ sb.append(" ");
+ }
+ sb.append(token);
+ first = false;
+ }
+ }
+ return sb.toString();
+ }
+
+ public static String getExactMatchFeature(MentionContext ec, MentionContext xec) {
+ //System.err.println("getExactMatchFeature: ec="+mentionString(ec)+" mc="+mentionString(xec));
+ if (mentionString(ec).equals(mentionString(xec))) {
+ return "exactMatch";
+ }
+ else if (excludedHonorificMentionString(ec).equals(excludedHonorificMentionString(xec))) {
+ return "exactMatchNoHonor";
+ }
+ else if (excludedTheMentionString(ec).equals(excludedTheMentionString(xec))) {
+ return "exactMatchNoThe";
+ }
+ else if (excludedDeterminerMentionString(ec).equals(excludedDeterminerMentionString(xec))) {
+ return "exactMatchNoDT";
+ }
+ return null;
+ }
+
  /**
   * Returns string-match features for the specified mention and entity.
   * @param mention The mention.
   * @param entity The entity.
   * @return list of string-match features for the specified mention and entity.
   */
  public static List<String> getStringMatchFeatures(MentionContext mention, DiscourseEntity entity) {
    boolean sameHead = false;
    boolean modsMatch = false;
    boolean titleMatch = false;
    boolean nonTheModsMatch = false;
    List<String> features = new ArrayList<String>();
    Parse[] mtokens = mention.getTokenParses();
    // Modifiers of the mention: all tokens before its head token, lower-cased.
    Set<String> ecModSet = constructModifierSet(mtokens, mention.getHeadTokenIndex());
    String mentionHeadString = mention.getHeadTokenText().toLowerCase();
    // Collected in a set so each feature is emitted at most once across all entity mentions.
    Set<String> featureSet = new HashSet<String>();
    for (Iterator<MentionContext> ei = entity.getMentions(); ei.hasNext();) {
      MentionContext entityMention = ei.next();
      // First try progressively normalized exact string matches.
      String exactMatchFeature = getExactMatchFeature(entityMention, mention);
      if (exactMatchFeature != null) {
        featureSet.add(exactMatchFeature);
      }
      // Flag when only the entity mention is a coordinated noun phrase.
      else if (entityMention.getParse().isCoordinatedNounPhrase() && !mention.getParse().isCoordinatedNounPhrase()) {
        featureSet.add("cmix");
      }
      // Otherwise try a whole-word substring match of the stripped forms.
      else {
        String mentionStrip = stripNp(mention);
        String entityMentionStrip = stripNp(entityMention);
        if (mentionStrip != null && entityMentionStrip != null) {
          if (isSubstring(mentionStrip, entityMentionStrip)) {
            featureSet.add("substring");
          }
        }
      }
      Parse[] xtoks = entityMention.getTokenParses();
      int headIndex = entityMention.getHeadTokenIndex();
      //if (!mention.getHeadTokenTag().equals(entityMention.getHeadTokenTag())) {
      // //System.err.println("skipping "+mention.headTokenText+" with "+xec.headTokenText+" because "+mention.headTokenTag+" != "+xec.headTokenTag);
      // continue;
      //} want to match NN NNP
      String entityMentionHeadString = entityMention.getHeadTokenText().toLowerCase();
      // model lexical similarity: same head word, then check whether modifiers also agree.
      if (mentionHeadString.equals(entityMentionHeadString)) {
        sameHead = true;
        featureSet.add("hds=" + mentionHeadString);
        if (!modsMatch || !nonTheModsMatch) { //only check if we haven't already found one which is the same
          modsMatch = true;
          nonTheModsMatch = true;
          Set<String> entityMentionModifierSet = constructModifierSet(xtoks, headIndex);
          // Any mention modifier missing from the entity mention breaks the match;
          // a missing "the" is tolerated for the weaker nonTheModsMatch.
          for (Iterator<String> mi = ecModSet.iterator(); mi.hasNext();) {
            String mw = mi.next();
            if (!entityMentionModifierSet.contains(mw)) {
              modsMatch = false;
              if (!mw.equals("the")) {
                nonTheModsMatch = false;
                featureSet.add("mmw=" + mw);
              }
            }
          }
        }
      }
      // Title match: the mention's head appears among the entity mention's descriptor tokens.
      Set<String> descModSet = constructModifierSet(xtoks, entityMention.getNonDescriptorStart());
      if (descModSet.contains(mentionHeadString)) {
        titleMatch = true;
      }
    }
    if (!featureSet.isEmpty()) {
      features.addAll(featureSet);
    }
    if (sameHead) {
      features.add("sameHead");
      if (modsMatch) {
        features.add("modsMatch");
      }
      else if (nonTheModsMatch) {
        features.add("nonTheModsMatch");
      }
      else {
        features.add("modsMisMatch");
      }
    }
    if (titleMatch) {
      features.add("titleMatch");
    }
    return features;
  }
+
+ public static boolean isSubstring(String ecStrip, String xecStrip) {
+ //System.err.println("MaxentResolver.isSubstring: ec="+ecStrip+" xec="+xecStrip);
+ int io = xecStrip.indexOf(ecStrip);
+ if (io != -1) {
+ //check boundries
+ if (io != 0 && xecStrip.charAt(io - 1) != ' ') {
+ return false;
+ }
+ int end = io + ecStrip.length();
+ if (end != xecStrip.length() && xecStrip.charAt(end) != ' ') {
+ return false;
+ }
+ return true;
+ }
+ return false;
+ }
+
+ public static String mentionString(MentionContext ec) {
+ StringBuilder sb = new StringBuilder();
+ Object[] mtokens = ec.getTokens();
+ sb.append(mtokens[0].toString());
+ for (int ti = 1, tl = mtokens.length; ti < tl; ti++) {
+ String token = mtokens[ti].toString();
+ sb.append(" ").append(token);
+ }
+ //System.err.println("mentionString "+ec+" == "+sb.toString()+" mtokens.length="+mtokens.length);
+ return sb.toString();
+ }
+
+ /**
+ * Returns a string for the specified mention with punctuation, honorifics,
+ * designators, and determiners removed.
+ *
+ * @param mention The mention to be striped.
+ *
+ * @return a normalized string representation of the specified mention.
+ */
+ public static String stripNp(MentionContext mention) {
+ int start=mention.getNonDescriptorStart(); //start after descriptors
+
+ Parse[] mtokens = mention.getTokenParses();
+ int end=mention.getHeadTokenIndex()+1;
+ if (start == end) {
+ //System.err.println("stripNp: return null 1");
+ return null;
+ }
+ //strip determiners
+ if (mtokens[start].getSyntacticType().equals("DT")) {
+ start++;
+ }
+ if (start == end) {
+ //System.err.println("stripNp: return null 2");
+ return null;
+ }
+ //get to first NNP
+ String type;
+ for (int i=start;i<end;i++) {
+ type = mtokens[start].getSyntacticType();
+ if (type.startsWith("NNP")) {
+ break;
+ }
+ start++;
+ }
+ if (start == end) {
+ //System.err.println("stripNp: return null 3");
+ return null;
+ }
+ if (start+1 != end) { // don't do this on head words, to keep "U.S."
+ //strip off honorifics in begining
+ if (honorificsPattern.matcher(mtokens[start].toString()).find()) {
+ start++;
+ }
+ if (start == end) {
+ //System.err.println("stripNp: return null 4");
+ return null;
+ }
+ //strip off and honerifics on the end
+ if (designatorsPattern.matcher(mtokens[mtokens.length - 1].toString()).find()) {
+ end--;
+ }
+ }
+ if (start == end) {
+ //System.err.println("stripNp: return null 5");
+ return null;
+ }
+ String strip = "";
+ for (int i = start; i < end; i++) {
+ strip += mtokens[i].toString() + ' ';
+ }
+ return strip.trim();
+ }
+
+ public static MentionContext getProperNounExtent(DiscourseEntity de) {
+ for (Iterator<MentionContext> ei = de.getMentions(); ei.hasNext();) { //use first extent which is propername
+ MentionContext xec = ei.next();
+ String xecHeadTag = xec.getHeadTokenTag();
+ if (xecHeadTag.startsWith("NNP") || initialCaps.matcher(xec.getHeadTokenText()).find()) {
+ return xec;
+ }
+ }
+ return null;
+ }
+
+ private static Map<String, String> getPronounFeatureMap(String pronoun) {
+ Map<String, String> pronounMap = new HashMap<String, String>();
+ if (malePronounPattern.matcher(pronoun).matches()) {
+ pronounMap.put("gender","male");
+ }
+ else if (femalePronounPattern.matcher(pronoun).matches()) {
+ pronounMap.put("gender","female");
+ }
+ else if (neuterPronounPattern.matcher(pronoun).matches()) {
+ pronounMap.put("gender","neuter");
+ }
+ if (singularPronounPattern.matcher(pronoun).matches()) {
+ pronounMap.put("number","singular");
+ }
+ else if (pluralPronounPattern.matcher(pronoun).matches()) {
+ pronounMap.put("number","plural");
+ }
+ /*
+ if (Linker.firstPersonPronounPattern.matcher(pronoun).matches()) {
+ pronounMap.put("person","first");
+ }
+ else if (Linker.secondPersonPronounPattern.matcher(pronoun).matches()) {
+ pronounMap.put("person","second");
+ }
+ else if (Linker.thirdPersonPronounPattern.matcher(pronoun).matches()) {
+ pronounMap.put("person","third");
+ }
+ */
+ return pronounMap;
+ }
+
  /**
   * Returns features indicating whether the specified mention is compatible with the pronouns
   * of the specified entity.
   * @param mention The mention.
   * @param entity The entity.
   * @return list of features indicating whether the specified mention is compatible with the pronouns
   * of the specified entity.
   */
  public static List<String> getPronounMatchFeatures(MentionContext mention, DiscourseEntity entity) {
    boolean foundCompatiblePronoun = false;
    boolean foundIncompatiblePronoun = false;
    // Only applies when the mention itself is pronominal (PRP* head tag).
    if (mention.getHeadTokenTag().startsWith("PRP")) {
      Map<String, String> pronounMap = getPronounFeatureMap(mention.getHeadTokenText());
      //System.err.println("getPronounMatchFeatures.pronounMap:"+pronounMap);
      // Compare the mention's pronoun features against every pronominal mention of the entity.
      for (Iterator<MentionContext> mi=entity.getMentions();mi.hasNext();) {
        MentionContext candidateMention = mi.next();
        if (candidateMention.getHeadTokenTag().startsWith("PRP")) {
          // Identical pronoun text (case-insensitive) is trivially compatible.
          if (mention.getHeadTokenText().equalsIgnoreCase(candidateMention.getHeadTokenText())) {
            foundCompatiblePronoun = true;
            break;
          }
          else {
            Map<String, String> candidatePronounMap = getPronounFeatureMap(candidateMention.getHeadTokenText());
            //System.err.println("getPronounMatchFeatures.candidatePronounMap:"+candidatePronounMap);
            // Compatible only when every feature of the mention's pronoun has the
            // same value in the candidate's map; a conflicting value also records
            // an incompatibility; a missing value merely prevents compatibility.
            boolean allKeysMatch = true;
            for (Iterator<String> ki = pronounMap.keySet().iterator(); ki.hasNext();) {
              String key = ki.next();
              String cfv = candidatePronounMap.get(key);
              if (cfv != null) {
                if (!pronounMap.get(key).equals(cfv)) {
                  foundIncompatiblePronoun = true;
                  allKeysMatch = false;
                }
              }
              else {
                allKeysMatch = false;
              }
            }
            if (allKeysMatch) {
              foundCompatiblePronoun = true;
            }
          }
        }
      }
    }
    // Both features can be emitted when different entity mentions are
    // respectively compatible and incompatible with the mention.
    List<String> pronounFeatures = new ArrayList<String>();
    if (foundCompatiblePronoun) {
      pronounFeatures.add("compatiblePronoun");
    }
    if (foundIncompatiblePronoun) {
      pronounFeatures.add("incompatiblePronoun");
    }
    return pronounFeatures;
  }
+
+ /**
+ * Returns distance features for the specified mention and entity.
+ * @param mention The mention.
+ * @param entity The entity.
+ * @return list of distance features for the specified mention and entity.
+ */
+ public static List<String> getDistanceFeatures(MentionContext mention, DiscourseEntity entity) {
+ List<String> features = new ArrayList<String>();
+ MentionContext cec = entity.getLastExtent();
+ int entityDistance = mention.getNounPhraseDocumentIndex()- cec.getNounPhraseDocumentIndex();
+ int sentenceDistance = mention.getSentenceNumber() - cec.getSentenceNumber();
+ int hobbsEntityDistance;
+ if (sentenceDistance == 0) {
+ hobbsEntityDistance = cec.getNounPhraseSentenceIndex();
+ }
+ else {
+ //hobbsEntityDistance = entityDistance - (entities within sentence from mention to end) + (entities within sentence form start to mention)
+ //hobbsEntityDistance = entityDistance - (cec.maxNounLocation - cec.getNounPhraseSentenceIndex) + cec.getNounPhraseSentenceIndex;
+ hobbsEntityDistance = entityDistance + (2 * cec.getNounPhraseSentenceIndex()) - cec.getMaxNounPhraseSentenceIndex();
+ }
+ features.add("hd=" + hobbsEntityDistance);
+ features.add("de=" + entityDistance);
+ features.add("ds=" + sentenceDistance);
+ //features.add("ds=" + sdist + pronoun);
+ //features.add("dn=" + cec.sentenceNumber);
+ //features.add("ep=" + cec.nounLocation);
+ return (features);
+ }
+
+ /**
+ * Returns whether the specified token is a definite article.
+ * @param tok The token.
+ * @param tag The pos-tag for the specified token.
+ * @return whether the specified token is a definite article.
+ */
+ public static boolean definiteArticle(String tok, String tag) {
+ tok = tok.toLowerCase();
+ if (tok.equals("the") || tok.equals("these") || tok.equals("these") || tag.equals("PRP$")) {
+ return (true);
+ }
+ return (false);
+ }
+
+ public static String getNumberCompatibilityFeature(MentionContext ec, DiscourseEntity de) {
+ NumberEnum en = de.getNumber();
+ if (en == NumberEnum.UNKNOWN || ec.getNumber() == NumberEnum.UNKNOWN) {
+ return NUM_UNKNOWN;
+ }
+ else if (ec.getNumber() == en) {
+ return NUM_COMPATIBLE;
+ }
+ else {
+ return NUM_INCOMPATIBLE;
+ }
+ }
+
+
+
+ /**
+ * Returns features indicating whether the specified mention and the specified entity are compatible.
+ * @param mention The mention.
+ * @param entity The entity.
+ * @return list of features indicating whether the specified mention and the specified entity are compatible.
+ */
+ public static List<String> getCompatibilityFeatures(MentionContext mention, DiscourseEntity entity, TestSimilarityModel simModel) {
+ List<String> compatFeatures = new ArrayList<String>();
+ String semCompatible = getSemanticCompatibilityFeature(mention, entity, simModel);
+ compatFeatures.add(semCompatible);
+ String genCompatible = getGenderCompatibilityFeature(mention, entity);
+ compatFeatures.add(genCompatible);
+ String numCompatible = ResolverUtils.getNumberCompatibilityFeature(mention, entity);
+ compatFeatures.add(numCompatible);
+ if (semCompatible.equals(SIM_COMPATIBLE) && genCompatible.equals(GEN_COMPATIBLE) && numCompatible.equals(ResolverUtils.NUM_COMPATIBLE)) {
+ compatFeatures.add("all.compatible");
+ }
+ else if (semCompatible.equals(SIM_INCOMPATIBLE) || genCompatible.equals(GEN_INCOMPATIBLE) || numCompatible.equals(ResolverUtils.NUM_INCOMPATIBLE)) {
+ compatFeatures.add("some.incompatible");
+ }
+ return compatFeatures;
+ }
+
+ public static String getGenderCompatibilityFeature(MentionContext ec, DiscourseEntity de) {
+ GenderEnum eg = de.getGender();
+ //System.err.println("getGenderCompatibility: mention="+ec.getGender()+" entity="+eg);
+ if (eg == GenderEnum.UNKNOWN || ec.getGender() == GenderEnum.UNKNOWN) {
+ return GEN_UNKNOWN;
+ }
+ else if (ec.getGender() == eg) {
+ return GEN_COMPATIBLE;
+ }
+ else {
+ return GEN_INCOMPATIBLE;
+ }
+ }
+
+ public static String getSemanticCompatibilityFeature(MentionContext ec, DiscourseEntity de, TestSimilarityModel simModel) {
+ if (simModel != null) {
+ double best = 0;
+ for (Iterator<MentionContext> xi = de.getMentions(); xi.hasNext();) {
+ MentionContext ec2 = xi.next();
+ double sim = simModel.compatible(ec, ec2);
+ if (sim > best) {
+ best = sim;
+ }
+ }
+ if (best > MIN_SIM_PROB) {
+ return SIM_COMPATIBLE;
+ }
+ else if (best > (1 - MIN_SIM_PROB)) {
+ return SIM_UNKNOWN;
+ }
+ else {
+ return SIM_INCOMPATIBLE;
+ }
+ }
+ else {
+ System.err.println("MaxentResolver: Uninitialized Semantic Model");
+ return SIM_UNKNOWN;
+ }
+ }
+
+ public static String getMentionCountFeature(DiscourseEntity de) {
+ if (de.getNumMentions() >= 5) {
+ return ("mc=5+");
+ }
+ else {
+ return ("mc=" + de.getNumMentions());
+ }
+ }
+
+ /**
+ * Returns a string representing the gender of the specified pronoun.
+ * @param pronoun An English pronoun.
+ * @return the gender of the specified pronoun.
+ */
+ public static String getPronounGender(String pronoun) {
+ if (malePronounPattern.matcher(pronoun).matches()) {
+ return "m";
+ }
+ else if (femalePronounPattern.matcher(pronoun).matches()) {
+ return "f";
+ }
+ else if (neuterPronounPattern.matcher(pronoun).matches()) {
+ return "n";
+ }
+ else {
+ return "u";
+ }
+ }
+
+
+
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/SingletonNonReferentialResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/SingletonNonReferentialResolver.java
new file mode 100644
index 0000000..746f97d
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/SingletonNonReferentialResolver.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.coref.resolver;
+
+import java.io.IOException;
+
/**
 * This class allows you to share a single instance of a non-referential resolver
 * among several resolvers.
 */
public class SingletonNonReferentialResolver extends DefaultNonReferentialResolver {

  // Lazily-created shared instance.
  // NOTE(review): lazy initialization here is not thread-safe — confirm that
  // resolver construction only happens from a single thread.
  private static SingletonNonReferentialResolver resolver;
  // Guards against training the shared instance more than once.
  private static boolean trained;

  private SingletonNonReferentialResolver(String projectName, ResolverMode mode) throws IOException {
    super(projectName, "nonref", mode);
  }

  /**
   * Returns the shared non-referential resolver, creating it on first use.
   *
   * NOTE(review): modelName and mode are ignored on every call after the
   * first — a caller passing a different mode silently receives the
   * originally-created instance.
   *
   * @param modelName the project/model name used to create the resolver.
   * @param mode the resolver mode.
   * @return the shared resolver instance.
   * @throws IOException if the underlying resolver cannot be created.
   */
  public static SingletonNonReferentialResolver getInstance(String modelName, ResolverMode mode) throws IOException {
    if (resolver == null) {
      resolver = new SingletonNonReferentialResolver(modelName, mode);
    }
    return resolver;
  }


  /**
   * Trains the shared instance at most once; subsequent calls are no-ops.
   *
   * @throws IOException if training fails.
   */
  @Override
  public void train() throws IOException {
    if (!trained) {
      super.train();
      trained = true;
    }
  }
}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/SingularPronounResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/SingularPronounResolver.java
new file mode 100644
index 0000000..6e84140
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/SingularPronounResolver.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import opennlp.tools.coref.DiscourseEntity;
+import opennlp.tools.coref.mention.MentionContext;
+
+/**
+ * This class resolver singular pronouns such as "he", "she", "it" and their various forms.
+ */
+public class SingularPronounResolver extends MaxentResolver {
+
+ int mode;
+
+ Pattern PronounPattern;
+
+ public SingularPronounResolver(String projectName, ResolverMode m) throws IOException {
+ super(projectName, "pmodel", m, 30);
+ this.numSentencesBack = 2;
+ }
+
+ public SingularPronounResolver(String projectName, ResolverMode m, NonReferentialResolver nonReferentialResolver) throws IOException {
+ super(projectName, "pmodel", m, 30,nonReferentialResolver);
+ this.numSentencesBack = 2;
+ }
+
+ public boolean canResolve(MentionContext mention) {
+ //System.err.println("MaxentSingularPronounResolver.canResolve: ec= ("+mention.id+") "+ mention.toText());
+ String tag = mention.getHeadTokenTag();
+ return (tag != null && tag.startsWith("PRP") && ResolverUtils.singularThirdPersonPronounPattern.matcher(mention.getHeadTokenText()).matches());
+ }
+
+ @Override
+ protected List<String> getFeatures(MentionContext mention, DiscourseEntity entity) {
+ List<String> features = new ArrayList<String>();
+ features.addAll(super.getFeatures(mention, entity));
+ if (entity != null) { //generate pronoun w/ referent features
+ MentionContext cec = entity.getLastExtent();
+ //String gen = getPronounGender(pronoun);
+ features.addAll(ResolverUtils.getPronounMatchFeatures(mention,entity));
+ features.addAll(ResolverUtils.getContextFeatures(cec));
+ features.addAll(ResolverUtils.getDistanceFeatures(mention,entity));
+ features.add(ResolverUtils.getMentionCountFeature(entity));
+ /*
+ //lexical features
+ Set featureSet = new HashSet();
+ for (Iterator ei = entity.getExtents(); ei.hasNext();) {
+ MentionContext ec = (MentionContext) ei.next();
+ List toks = ec.tokens;
+ Parse tok;
+ int headIndex = PTBHeadFinder.getInstance().getHeadIndex(toks);
+ for (int ti = 0; ti < headIndex; ti++) {
+ tok = (Parse) toks.get(ti);
+ featureSet.add(gen + "mw=" + tok.toString().toLowerCase());
+ featureSet.add(gen + "mt=" + tok.getSyntacticType());
+ }
+ tok = (Parse) toks.get(headIndex);
+ featureSet.add(gen + "hw=" + tok.toString().toLowerCase());
+ featureSet.add(gen + "ht=" + tok.getSyntacticType());
+ //semantic features
+ if (ec.neType != null) {
+ featureSet.add(gen + "," + ec.neType);
+ }
+ else {
+ for (Iterator si = ec.synsets.iterator(); si.hasNext();) {
+ Integer synset = (Integer) si.next();
+ featureSet.add(gen + "," + synset);
+ }
+ }
+ }
+ Iterator fset = featureSet.iterator();
+ while (fset.hasNext()) {
+ String f = (String) fset.next();
+ features.add(f);
+ }
+ */
+ }
+ return (features);
+ }
+
+ @Override
+ public boolean excluded(MentionContext mention, DiscourseEntity entity) {
+ if (super.excluded(mention, entity)) {
+ return (true);
+ }
+ String mentionGender = null;
+
+ for (Iterator<MentionContext> ei = entity.getMentions(); ei.hasNext();) {
+ MentionContext entityMention = ei.next();
+ String tag = entityMention.getHeadTokenTag();
+ if (tag != null && tag.startsWith("PRP") && ResolverUtils.singularThirdPersonPronounPattern.matcher(mention.getHeadTokenText()).matches()) {
+ if (mentionGender == null) { //lazy initialization
+ mentionGender = ResolverUtils.getPronounGender(mention.getHeadTokenText());
+ }
+ String entityGender = ResolverUtils.getPronounGender(entityMention.getHeadTokenText());
+ if (!entityGender.equals("u") && !mentionGender.equals(entityGender)) {
+ return (true);
+ }
+ }
+ }
+ return (false);
+ }
+
+ @Override
+ protected boolean outOfRange(MentionContext mention, DiscourseEntity entity) {
+ MentionContext cec = entity.getLastExtent();
+ //System.err.println("MaxentSingularPronounresolve.outOfRange: ["+entity.getLastExtent().toText()+" ("+entity.getId()+")] ["+mention.toText()+" ("+mention.getId()+")] entity.sentenceNumber=("+entity.getLastExtent().getSentenceNumber()+")-mention.sentenceNumber=("+mention.getSentenceNumber()+") > "+numSentencesBack);
+ return (mention.getSentenceNumber() - cec.getSentenceNumber() > numSentencesBack);
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/SpeechPronounResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/SpeechPronounResolver.java
new file mode 100644
index 0000000..bc5d2d4
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/SpeechPronounResolver.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.resolver;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.coref.DiscourseEntity;
+import opennlp.tools.coref.mention.MentionContext;
+
+/**
+ * Resolves pronouns specific to quoted speech such as "you", "me", and "I".
+ */
+public class SpeechPronounResolver extends MaxentResolver {
+
+ public SpeechPronounResolver(String projectName, ResolverMode m) throws IOException {
+ super(projectName,"fmodel", m, 30);
+ this.numSentencesBack = 0;
+ showExclusions = false;
+ preferFirstReferent = true;
+ }
+
+ public SpeechPronounResolver(String projectName, ResolverMode m, NonReferentialResolver nrr) throws IOException {
+ super(projectName,"fmodel", m, 30,nrr);
+ showExclusions = false;
+ preferFirstReferent = true;
+ }
+
+
+ @Override
+ protected List<String> getFeatures(MentionContext mention, DiscourseEntity entity) {
+ List<String> features = new ArrayList<String>();
+ features.addAll(super.getFeatures(mention, entity));
+ if (entity != null) {
+ features.addAll(ResolverUtils.getPronounMatchFeatures(mention,entity));
+ List<String> contexts = ResolverUtils.getContextFeatures(mention);
+ MentionContext cec = entity.getLastExtent();
+ if (mention.getHeadTokenTag().startsWith("PRP") && cec.getHeadTokenTag().startsWith("PRP")) {
+ features.add(mention.getHeadTokenText() + "," + cec.getHeadTokenText());
+ }
+ else if (mention.getHeadTokenText().startsWith("NNP")) {
+ for (int ci = 0, cl = contexts.size(); ci < cl; ci++) {
+ features.add(contexts.get(ci));
+ }
+ features.add(mention.getNameType() + "," + cec.getHeadTokenText());
+ }
+ else {
+ List<String> ccontexts = ResolverUtils.getContextFeatures(cec);
+ for (int ci = 0, cl = ccontexts.size(); ci < cl; ci++) {
+ features.add(ccontexts.get(ci));
+ }
+ features.add(cec.getNameType() + "," + mention.getHeadTokenText());
+ }
+ }
+ return (features);
+ }
+
+ @Override
+ protected boolean outOfRange(MentionContext mention, DiscourseEntity entity) {
+ MentionContext cec = entity.getLastExtent();
+ return (mention.getSentenceNumber() - cec.getSentenceNumber() > numSentencesBack);
+ }
+
+ public boolean canResolve(MentionContext mention) {
+ String tag = mention.getHeadTokenTag();
+ boolean fpp = tag != null && tag.startsWith("PRP") && ResolverUtils.speechPronounPattern.matcher(mention.getHeadTokenText()).matches();
+ boolean pn = tag != null && tag.startsWith("NNP");
+ return (fpp || pn);
+ }
+
+ @Override
+ protected boolean excluded(MentionContext mention, DiscourseEntity entity) {
+ if (super.excluded(mention, entity)) {
+ return true;
+ }
+ MentionContext cec = entity.getLastExtent();
+ if (!canResolve(cec)) {
+ return true;
+ }
+ if (mention.getHeadTokenTag().startsWith("NNP")) { //mention is a propernoun
+ if (cec.getHeadTokenTag().startsWith("NNP")) {
+ return true; // both NNP
+ }
+ else {
+ if (entity.getNumMentions() > 1) {
+ return true;
+ }
+ return !canResolve(cec);
+ }
+ }
+ else if (mention.getHeadTokenTag().startsWith("PRP")){ // mention is a speech pronoun
+ // cec can be either a speech pronoun or a propernoun
+ if (cec.getHeadTokenTag().startsWith("NNP")) {
+ //exclude antecedents not in the same sentence when they are not pronoun
+ return (mention.getSentenceNumber() - cec.getSentenceNumber() != 0);
+ }
+ else if (cec.getHeadTokenTag().startsWith("PRP")){
+ return false;
+ }
+ else {
+ System.err.println("Unexpected candidate exluded: "+cec.toText());
+ return true;
+ }
+ }
+ else {
+ System.err.println("Unexpected mention exluded: "+mention.toText());
+ return true;
+ }
+ }
+
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/package-info.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/package-info.java
new file mode 100644
index 0000000..fb59395
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Package related to resolution techniques for coreference resolution.
+ */
+package opennlp.tools.coref.resolver;
\ No newline at end of file
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/Context.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/Context.java
new file mode 100644
index 0000000..174437c
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/Context.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.coref.mention.Dictionary;
+import opennlp.tools.coref.mention.DictionaryFactory;
+import opennlp.tools.coref.mention.HeadFinder;
+import opennlp.tools.coref.mention.Mention;
+import opennlp.tools.coref.mention.Parse;
+import opennlp.tools.util.Span;
+
+/**
+ * Specifies the context of a mention for computing gender, number, and semantic compatibility.
+ */
+public class Context extends Mention {
+
+ protected String headTokenText;
+ protected String headTokenTag;
+ protected Set<String> synsets;
+ protected Object[] tokens;
+
+ /** The token index in of the head word of this mention. */
+ protected int headTokenIndex;
+
+ public Context(Span span, Span headSpan, int entityId, Parse parse, String extentType, String nameType, HeadFinder headFinder) {
+ super(span,headSpan,entityId,parse,extentType,nameType);
+ init(headFinder);
+ }
+
+ public Context(Object[] tokens, String headToken, String headTag, String neType) {
+ super(null,null,1,null,null,neType);
+ this.tokens =tokens;
+ this.headTokenIndex = tokens.length-1;
+ this.headTokenText = headToken;
+ this.headTokenTag = headTag;
+ this.synsets = getSynsetSet(this);
+ }
+
+ public Context(Mention mention, HeadFinder headFinder) {
+ super(mention);
+ init(headFinder);
+ }
+
+ private void init(HeadFinder headFinder) {
+ Parse head = headFinder.getLastHead(parse);
+ List<Parse> tokenList = head.getTokens();
+ headTokenIndex = headFinder.getHeadIndex(head);
+ Parse headToken = headFinder.getHeadToken(head);
+ tokens = tokenList.toArray(new Parse[tokenList.size()]);
+ this.headTokenTag = headToken.getSyntacticType();
+ this.headTokenText = headToken.toString();
+ if (headTokenTag.startsWith("NN") && !headTokenTag.startsWith("NNP")) {
+ this.synsets = getSynsetSet(this);
+ }
+ else {
+ this.synsets = Collections.emptySet();
+ }
+ }
+
+
+ public static Context[] constructContexts(Mention[] mentions,HeadFinder headFinder) {
+ Context[] contexts = new Context[mentions.length];
+ for (int mi=0;mi<mentions.length;mi++) {
+ contexts[mi] = new Context(mentions[mi],headFinder);
+ }
+ return contexts;
+ }
+
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ for (int ti=0,tl=tokens.length;ti<tl;ti++){
+ sb.append(tokens[ti]).append(" ");
+ }
+ return sb.toString();
+ }
+
+ public Object[] getTokens() {
+ return tokens;
+ }
+
+ public String getHeadTokenText() {
+ return headTokenText;
+ }
+
+ public String getHeadTokenTag() {
+ return headTokenTag;
+ }
+
+ public Set<String> getSynsets() {
+ return synsets;
+ }
+
+ public static Context parseContext(String word) {
+ String[] parts = word.split("/");
+ if (parts.length == 2) {
+ String[] tokens = parts[0].split(" ");
+ return new Context(tokens,tokens[tokens.length-1], parts[1], null);
+ }
+ else if (parts.length == 3) {
+ String[] tokens = parts[0].split(" ");
+ return new Context(tokens,tokens[tokens.length-1], parts[1], parts[2]);
+ }
+ return null;
+ }
+
+ private static Set<String> getSynsetSet(Context c) {
+ Set<String> synsetSet = new HashSet<String>();
+ String[] lemmas = getLemmas(c);
+ Dictionary dict = DictionaryFactory.getDictionary();
+ //System.err.println(lemmas.length+" lemmas for "+c.headToken);
+ for (int li = 0; li < lemmas.length; li++) {
+ String senseKey = dict.getSenseKey(lemmas[li],"NN",0);
+ if (senseKey != null) {
+ synsetSet.add(senseKey);
+ String[] synsets = dict.getParentSenseKeys(lemmas[li],"NN",0);
+ for (int si=0,sn=synsets.length;si<sn;si++) {
+ synsetSet.add(synsets[si]);
+ }
+ }
+ }
+ return synsetSet;
+ }
+
+ private static String[] getLemmas(Context c) {
+ String word = c.headTokenText.toLowerCase();
+ return DictionaryFactory.getDictionary().getLemmas(word,"NN");
+ }
+
+ /** Returns the token index into the mention for the head word.
+ * @return the token index into the mention for the head word.
+ */
+ public int getHeadTokenIndex() {
+ return headTokenIndex;
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/Gender.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/Gender.java
new file mode 100644
index 0000000..bb0c996
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/Gender.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+
+/**
+ * Class which models the gender of an entity and the confidence of that association.
+ */
+public class Gender {
+
+ private GenderEnum type;
+ private double confidence;
+
+ public Gender(GenderEnum type,double confidence) {
+ this.type = type;
+ this.confidence = confidence;
+ }
+
+ public GenderEnum getType() {
+ return type;
+ }
+
+ public double getConfidence() {
+ return confidence;
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/GenderEnum.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/GenderEnum.java
new file mode 100644
index 0000000..d9fb598
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/GenderEnum.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+
+/**
+ * Enumeration of gender types.
+ */
+public class GenderEnum {
+ private String gender;
+
+ /** Male gender. */
+ public static final GenderEnum MALE = new GenderEnum("male");
+ /** Female gender. */
+ public static final GenderEnum FEMALE = new GenderEnum("female");
+ /** Nueter gender. */
+ public static final GenderEnum NEUTER = new GenderEnum("neuter");
+ /** Unknown gender. */
+ public static final GenderEnum UNKNOWN = new GenderEnum("unknown");
+
+ private GenderEnum(String g) {
+ gender = g;
+ }
+
+ @Override
+ public String toString() {
+ return gender;
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/GenderModel.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/GenderModel.java
new file mode 100644
index 0000000..33c63b6
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/GenderModel.java
@@ -0,0 +1,286 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.coref.sim;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.maxent.GIS;
+import opennlp.maxent.io.SuffixSensitiveGISModelReader;
+import opennlp.maxent.io.SuffixSensitiveGISModelWriter;
+import opennlp.model.Event;
+import opennlp.model.MaxentModel;
+import opennlp.tools.coref.resolver.ResolverUtils;
+import opennlp.tools.util.CollectionEventStream;
+import opennlp.tools.util.HashList;
+
+/**
+ * Class which models the gender of a particular mentions and entities made up of mentions.
+ */
+public class GenderModel implements TestGenderModel, TrainSimilarityModel {
+
+ private int maleIndex;
+ private int femaleIndex;
+ private int neuterIndex;
+
+ private String modelName;
+ private String modelExtension = ".bin.gz";
+ private MaxentModel testModel;
+ private List<Event> events;
+ private boolean debugOn = true;
+
+ private Set<String> maleNames;
+ private Set<String> femaleNames;
+
+ public static TestGenderModel testModel(String name) throws IOException {
+ GenderModel gm = new GenderModel(name, false);
+ return gm;
+ }
+
+ public static TrainSimilarityModel trainModel(String name) throws IOException {
+ GenderModel gm = new GenderModel(name, true);
+ return gm;
+ }
+
+ private Set<String> readNames(String nameFile) throws IOException {
+ Set<String> names = new HashSet<String>();
+ BufferedReader nameReader = new BufferedReader(new FileReader(nameFile));
+ for (String line = nameReader.readLine(); line != null; line = nameReader.readLine()) {
+ names.add(line);
+ }
+ return names;
+ }
+
+ private GenderModel(String modelName, boolean train) throws IOException {
+ this.modelName = modelName;
+ maleNames = readNames(modelName+".mas");
+ femaleNames = readNames(modelName+".fem");
+ if (train) {
+ events = new ArrayList<Event>();
+ }
+ else {
+ //if (MaxentResolver.loadAsResource()) {
+ // testModel = (new BinaryGISModelReader(new DataInputStream(this.getClass().getResourceAsStream(modelName)))).getModel();
+ //}
+ testModel = (new SuffixSensitiveGISModelReader(new File(modelName+modelExtension))).getModel();
+ maleIndex = testModel.getIndex(GenderEnum.MALE.toString());
+ femaleIndex = testModel.getIndex(GenderEnum.FEMALE.toString());
+ neuterIndex = testModel.getIndex(GenderEnum.NEUTER.toString());
+ }
+ }
+
+ private List<String> getFeatures(Context np1) {
+ List<String> features = new ArrayList<String>();
+ features.add("default");
+ for (int ti = 0, tl = np1.getHeadTokenIndex(); ti < tl; ti++) {
+ features.add("mw=" + np1.getTokens()[ti].toString());
+ }
+ features.add("hw=" + np1.getHeadTokenText());
+ features.add("n="+np1.getNameType());
+ if (np1.getNameType() != null && np1.getNameType().equals("person")) {
+ Object[] tokens = np1.getTokens();
+ //System.err.println("GenderModel.getFeatures: person name="+np1);
+ for (int ti=0;ti<np1.getHeadTokenIndex() || ti==0;ti++) {
+ String name = tokens[ti].toString().toLowerCase();
+ if (femaleNames.contains(name)) {
+ features.add("fem");
+ //System.err.println("GenderModel.getFeatures: person (fem) "+np1);
+ }
+ if (maleNames.contains(name)) {
+ features.add("mas");
+ //System.err.println("GenderModel.getFeatures: person (mas) "+np1);
+ }
+ }
+ }
+
+ for (String si : np1.getSynsets()) {
+ features.add("ss=" + si);
+ }
+ return features;
+ }
+
+ private void addEvent(String outcome, Context np1) {
+ List<String> feats = getFeatures(np1);
+ events.add(new Event(outcome, feats.toArray(new String[feats.size()])));
+ }
+
+ /**
+ * Heuristic computation of gender for a mention context using pronouns and honorifics.
+ * @param mention The mention whose gender is to be computed.
+ * @return The heuristically determined gender or unknown.
+ */
+ private GenderEnum getGender(Context mention) {
+ if (ResolverUtils.malePronounPattern.matcher(mention.getHeadTokenText()).matches()) {
+ return GenderEnum.MALE;
+ }
+ else if (ResolverUtils.femalePronounPattern.matcher(mention.getHeadTokenText()).matches()) {
+ return GenderEnum.FEMALE;
+ }
+ else if (ResolverUtils.neuterPronounPattern.matcher(mention.getHeadTokenText()).matches()) {
+ return GenderEnum.NEUTER;
+ }
+ Object[] mtokens = mention.getTokens();
+ for (int ti = 0, tl = mtokens.length - 1; ti < tl; ti++) {
+ String token = mtokens[ti].toString();
+ if (token.equals("Mr.") || token.equals("Mr")) {
+ return GenderEnum.MALE;
+ }
+ else if (token.equals("Mrs.") || token.equals("Mrs") || token.equals("Ms.") || token.equals("Ms")) {
+ return GenderEnum.FEMALE;
+ }
+ }
+
+ return GenderEnum.UNKNOWN;
+ }
+
+ private GenderEnum getGender(List<Context> entity) {
+ for (Iterator<Context> ci = entity.iterator(); ci.hasNext();) {
+ Context ec = ci.next();
+ GenderEnum ge = getGender(ec);
+ if (ge != GenderEnum.UNKNOWN) {
+ return ge;
+ }
+ }
+
+ return GenderEnum.UNKNOWN;
+ }
+
+ @SuppressWarnings("unchecked")
+ public void setExtents(Context[] extentContexts) {
+ HashList entities = new HashList();
+ List<Context> singletons = new ArrayList<Context>();
+ for (int ei = 0, el = extentContexts.length; ei < el; ei++) {
+ Context ec = extentContexts[ei];
+ //System.err.println("GenderModel.setExtents: ec("+ec.getId()+") "+ec.toText());
+ if (ec.getId() != -1) {
+ entities.put(ec.getId(), ec);
+ }
+ else {
+ singletons.add(ec);
+ }
+ }
+ List<Context> males = new ArrayList<Context>();
+ List<Context> females = new ArrayList<Context>();
+ List<Context> eunuches = new ArrayList<Context>();
+ //coref entities
+ for (Iterator<Integer> ei = entities.keySet().iterator(); ei.hasNext();) {
+ Integer key = ei.next();
+ List<Context> entityContexts = (List<Context>) entities.get(key);
+ GenderEnum gender = getGender(entityContexts);
+ if (gender != null) {
+ if (gender == GenderEnum.MALE) {
+ males.addAll(entityContexts);
+ }
+ else if (gender == GenderEnum.FEMALE) {
+ females.addAll(entityContexts);
+ }
+ else if (gender == GenderEnum.NEUTER) {
+ eunuches.addAll(entityContexts);
+ }
+ }
+ }
+ //non-coref entities
+ for (Iterator<Context> ei = singletons.iterator(); ei.hasNext();) {
+ Context ec = ei.next();
+ GenderEnum gender = getGender(ec);
+ if (gender == GenderEnum.MALE) {
+ males.add(ec);
+ }
+ else if (gender == GenderEnum.FEMALE) {
+ females.add(ec);
+ }
+ else if (gender == GenderEnum.NEUTER) {
+ eunuches.add(ec);
+ }
+ }
+ for (Iterator<Context> mi = males.iterator(); mi.hasNext();) {
+ Context ec = mi.next();
+ addEvent(GenderEnum.MALE.toString(), ec);
+ }
+ for (Iterator<Context> fi = females.iterator(); fi.hasNext();) {
+ Context ec = fi.next();
+ addEvent(GenderEnum.FEMALE.toString(), ec);
+ }
+ for (Iterator<Context> ei = eunuches.iterator(); ei.hasNext();) {
+ Context ec = ei.next();
+ addEvent(GenderEnum.NEUTER.toString(), ec);
+ }
+ }
+
+ public static void main(String[] args) throws IOException {
+ if (args.length == 0) {
+ System.err.println("Usage: GenderModel modelName < tiger/NN bear/NN");
+ System.exit(1);
+ }
+ String modelName = args[0];
+ GenderModel model = new GenderModel(modelName, false);
+ //Context.wn = new WordNet(System.getProperty("WNHOME"), true);
+ //Context.morphy = new Morphy(Context.wn);
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ for (String line = in.readLine(); line != null; line = in.readLine()) {
+ String[] words = line.split(" ");
+ double[] dist = model.genderDistribution(Context.parseContext(words[0]));
+ System.out.println("m="+dist[model.getMaleIndex()] + " f=" +dist[model.getFemaleIndex()]+" n="+dist[model.getNeuterIndex()]+" "+model.getFeatures(Context.parseContext(words[0])));
+ }
+ }
+
+ public double[] genderDistribution(Context np1) {
+ List<String> features = getFeatures(np1);
+ if (debugOn) {
+ //System.err.println("GenderModel.genderDistribution: "+features);
+ }
+ return testModel.eval(features.toArray(new String[features.size()]));
+ }
+
+ public void trainModel() throws IOException {
+ if (debugOn) {
+ FileWriter writer = new FileWriter(modelName+".events");
+ for (Iterator<Event> ei=events.iterator();ei.hasNext();) {
+ Event e = ei.next();
+ writer.write(e.toString()+"\n");
+ }
+ writer.close();
+ }
+ new SuffixSensitiveGISModelWriter(
+ GIS.trainModel(
+ new CollectionEventStream(events), true),
+ new File(modelName+modelExtension)).persist();
+ }
+
+ public int getFemaleIndex() {
+ return femaleIndex;
+ }
+
+ public int getMaleIndex() {
+ return maleIndex;
+ }
+
+ public int getNeuterIndex() {
+ return neuterIndex;
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/MaxentCompatibilityModel.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/MaxentCompatibilityModel.java
new file mode 100644
index 0000000..b6e00a5
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/MaxentCompatibilityModel.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+
+import java.io.IOException;
+
+/**
+ * Model of mention compatibiltiy using a maxent model.
+ */
+public class MaxentCompatibilityModel {
+
+ private final double minGenderProb = 0.66;
+ private final double minNumberProb = 0.66;
+
+ private static TestGenderModel genModel;
+ private static TestNumberModel numModel;
+
+ private boolean debugOn = false;
+
+ public MaxentCompatibilityModel(String corefProject) throws IOException {
+ genModel = GenderModel.testModel(corefProject + "/gen");
+ numModel = NumberModel.testModel(corefProject + "/num");
+ }
+
+ public Gender computeGender(Context c) {
+ Gender gender;
+ double[] gdist = genModel.genderDistribution(c);
+ if (debugOn) {
+ System.err.println("MaxentCompatibilityModel.computeGender: "+c.toString()+" m="+gdist[genModel.getMaleIndex()]+" f="+gdist[genModel.getFemaleIndex()]+" n="+gdist[genModel.getNeuterIndex()]);
+ }
+ if (genModel.getMaleIndex() >= 0 && gdist[genModel.getMaleIndex()] > minGenderProb) {
+ gender = new Gender(GenderEnum.MALE,gdist[genModel.getMaleIndex()]);
+ }
+ else if (genModel.getFemaleIndex() >= 0 && gdist[genModel.getFemaleIndex()] > minGenderProb) {
+ gender = new Gender(GenderEnum.FEMALE,gdist[genModel.getFemaleIndex()]);
+ }
+ else if (genModel.getNeuterIndex() >= 0 && gdist[genModel.getNeuterIndex()] > minGenderProb) {
+ gender = new Gender(GenderEnum.NEUTER,gdist[genModel.getNeuterIndex()]);
+ }
+ else {
+ gender = new Gender(GenderEnum.UNKNOWN,minGenderProb);
+ }
+ return gender;
+ }
+
+ public Number computeNumber(Context c) {
+ double[] dist = numModel.numberDist(c);
+ Number number;
+ //System.err.println("MaxentCompatibiltyResolver.computeNumber: "+c+" sing="+dist[numModel.getSingularIndex()]+" plural="+dist[numModel.getPluralIndex()]);
+ if (dist[numModel.getSingularIndex()] > minNumberProb) {
+ number = new Number(NumberEnum.SINGULAR,dist[numModel.getSingularIndex()]);
+ }
+ else if (dist[numModel.getPluralIndex()] > minNumberProb) {
+ number = new Number(NumberEnum.PLURAL,dist[numModel.getPluralIndex()]);
+ }
+ else {
+ number = new Number(NumberEnum.UNKNOWN,minNumberProb);
+ }
+ return number;
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/Number.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/Number.java
new file mode 100644
index 0000000..27c1e49
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/Number.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+/**
+ * Class which models the number of an entity and the confidence of that association.
+ */
+public class Number {
+ private NumberEnum type;
+ private double confidence;
+
+ public Number(NumberEnum type,double confidence) {
+ this.type = type;
+ this.confidence = confidence;
+ }
+
+ public NumberEnum getType() {
+ return type;
+ }
+
+ public double getConfidence() {
+ return confidence;
+ }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/NumberEnum.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/NumberEnum.java
new file mode 100644
index 0000000..693f894
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/NumberEnum.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+
+/**
+ * Enumeration of number types.
+ */
+public class NumberEnum {
+
+ private final String name;
+
+ /**
+ * Singular number type.
+ */
+ public static final NumberEnum SINGULAR = new NumberEnum("singular");
+
+ /**
+ * Plural number type.
+ */
+ public static final NumberEnum PLURAL = new NumberEnum("plural");
+
+ /**
+ * Unknown number type.
+ */
+ public static final NumberEnum UNKNOWN = new NumberEnum("unknown");
+
+ private NumberEnum(String name) {
+ this.name = name;
+ }
+
+ @Override
+ public String toString(){
+ return name;
+ }
+
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/NumberModel.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/NumberModel.java
new file mode 100644
index 0000000..1d4e47a
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/NumberModel.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.maxent.GIS;
+import opennlp.maxent.io.SuffixSensitiveGISModelReader;
+import opennlp.maxent.io.SuffixSensitiveGISModelWriter;
+import opennlp.model.Event;
+import opennlp.model.MaxentModel;
+import opennlp.tools.coref.resolver.ResolverUtils;
+import opennlp.tools.util.CollectionEventStream;
+import opennlp.tools.util.HashList;
+
+/**
+ * Class which models the number of particular mentions and the entities made up of mentions.
+ * Operates in one of two modes: training (events are collected via
+ * {@link #setExtents(Context[])} and written by {@link #trainModel()}) or testing
+ * (a previously persisted maxent model is read and evaluated via {@link #numberDist(Context)}).
+ */
+public class NumberModel implements TestNumberModel, TrainSimilarityModel {
+
+  // Base name of the model file; modelExtension is appended when reading/writing.
+  private String modelName;
+  private String modelExtension = ".bin.gz";
+  // Maxent model used for evaluation; only initialized in testing mode.
+  private MaxentModel testModel;
+  // Training events accumulated by setExtents; only initialized in training mode.
+  private List<Event> events;
+
+  // Outcome indices of the singular and plural outcomes in the test model.
+  private int singularIndex;
+  private int pluralIndex;
+
+  /**
+   * Returns a number model suitable for testing, backed by the maxent model
+   * read from the specified file name.
+   *
+   * @param name The base name of the model file (without extension).
+   * @return The number model to be used for testing.
+   * @throws IOException If the model file can not be read.
+   */
+  public static TestNumberModel testModel(String name) throws IOException {
+    NumberModel nm = new NumberModel(name, false);
+    return nm;
+  }
+
+  /**
+   * Returns a number model suitable for training; events are gathered with
+   * {@link #setExtents(Context[])} and persisted with {@link #trainModel()}.
+   *
+   * @param modelName The base name of the model file to be written.
+   * @return The number model to be used for training.
+   * @throws IOException If an I/O error occurs.
+   */
+  public static TrainSimilarityModel trainModel(String modelName) throws IOException {
+    NumberModel gm = new NumberModel(modelName, true);
+    return gm;
+  }
+
+  /**
+   * Constructs a number model in either training or testing mode.
+   *
+   * @param modelName The base name of the model file.
+   * @param train If true, collect training events; otherwise read the model from disk
+   *     and cache the singular/plural outcome indices.
+   * @throws IOException If the model file can not be read in testing mode.
+   */
+  private NumberModel(String modelName, boolean train) throws IOException {
+    this.modelName = modelName;
+    if (train) {
+      events = new ArrayList<Event>();
+    }
+    else {
+      //if (MaxentResolver.loadAsResource()) {
+      //  testModel = (new PlainTextGISModelReader(new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream(modelName))))).getModel();
+      //}
+      testModel = (new SuffixSensitiveGISModelReader(new File(modelName+modelExtension))).getModel();
+      singularIndex = testModel.getIndex(NumberEnum.SINGULAR.toString());
+      pluralIndex = testModel.getIndex(NumberEnum.PLURAL.toString());
+    }
+  }
+
+  /**
+   * Produces the features used to predict the number of the specified context:
+   * a default feature, the modifier words, the lower-cased head word, and the head tag.
+   *
+   * @param np1 The context for which features are generated.
+   * @return A list of feature strings.
+   */
+  private List<String> getFeatures(Context np1) {
+    List<String> features = new ArrayList<String>();
+    features.add("default");
+    Object[] npTokens = np1.getTokens();
+    // All tokens except the last are treated as modifier words ("mw=").
+    // NOTE(review): assumes the head token is the last token — confirm against Context.
+    for (int ti = 0, tl = npTokens.length - 1; ti < tl; ti++) {
+      features.add("mw=" + npTokens[ti].toString());
+    }
+    features.add("hw=" + np1.getHeadTokenText().toLowerCase());
+    features.add("ht=" + np1.getHeadTokenTag());
+    return features;
+  }
+
+  // Adds a training event with the specified outcome for the specified context.
+  private void addEvent(String outcome, Context np1) {
+    List<String> feats = getFeatures(np1);
+    events.add(new Event(outcome, feats.toArray(new String[feats.size()])));
+  }
+
+  /**
+   * Determines the number of the specified context by matching its head token
+   * against the singular/plural pronoun patterns.
+   *
+   * @param ec The context whose head token is examined.
+   * @return {@link NumberEnum#SINGULAR}, {@link NumberEnum#PLURAL}, or
+   *     {@link NumberEnum#UNKNOWN} if the head token matches neither pattern.
+   */
+  public NumberEnum getNumber(Context ec) {
+    if (ResolverUtils.singularPronounPattern.matcher(ec.getHeadTokenText()).matches()) {
+      return NumberEnum.SINGULAR;
+    }
+    else if (ResolverUtils.pluralPronounPattern.matcher(ec.getHeadTokenText()).matches()) {
+      return NumberEnum.PLURAL;
+    }
+    else {
+      return NumberEnum.UNKNOWN;
+    }
+  }
+
+  /**
+   * Determines the number of an entity as the first non-unknown number found
+   * among its mentions.
+   *
+   * @param entity The mentions which make up the entity.
+   * @return The number of the entity, or {@link NumberEnum#UNKNOWN}.
+   */
+  private NumberEnum getNumber(List<Context> entity) {
+    for (Iterator<Context> ci = entity.iterator(); ci.hasNext();) {
+      Context ec = ci.next();
+      NumberEnum ne = getNumber(ec);
+      if (ne != NumberEnum.UNKNOWN) {
+        return ne;
+      }
+    }
+    return NumberEnum.UNKNOWN;
+  }
+
+  /**
+   * Gathers training events from the specified extents. Extents belonging to a
+   * coreference chain (id != -1) are grouped into entities; every mention of an
+   * entity whose number could be determined contributes an event with that number.
+   * Singleton extents (id == -1) contribute events based on their own head token.
+   *
+   * @param extentContexts The extents to generate training events from.
+   */
+  @SuppressWarnings("unchecked")
+  public void setExtents(Context[] extentContexts) {
+    HashList entities = new HashList();
+    List<Context> singletons = new ArrayList<Context>();
+    // Partition extents into coref entities and singletons.
+    for (int ei = 0, el = extentContexts.length; ei < el; ei++) {
+      Context ec = extentContexts[ei];
+      //System.err.println("NumberModel.setExtents: ec("+ec.getId()+") "+ec.toText());
+      if (ec.getId() != -1) {
+        entities.put(ec.getId(), ec);
+      }
+      else {
+        singletons.add(ec);
+      }
+    }
+    List<Context> singles = new ArrayList<Context>();
+    List<Context> plurals = new ArrayList<Context>();
+    // coref entities
+    for (Iterator<Integer> ei = entities.keySet().iterator(); ei.hasNext();) {
+      Integer key = ei.next();
+      List<Context> entityContexts = (List<Context>) entities.get(key);
+      NumberEnum number = getNumber(entityContexts);
+      if (number == NumberEnum.SINGULAR) {
+        singles.addAll(entityContexts);
+      }
+      else if (number == NumberEnum.PLURAL) {
+        plurals.addAll(entityContexts);
+      }
+    }
+    // non-coref entities.
+    for (Iterator<Context> ei = singletons.iterator(); ei.hasNext();) {
+      Context ec = ei.next();
+      NumberEnum number = getNumber(ec);
+      if (number == NumberEnum.SINGULAR) {
+        singles.add(ec);
+      }
+      else if (number == NumberEnum.PLURAL) {
+        plurals.add(ec);
+      }
+    }
+
+    // Emit one event per collected mention, labeled with its entity's number.
+    for (Iterator<Context> si = singles.iterator(); si.hasNext();) {
+      Context ec = si.next();
+      addEvent(NumberEnum.SINGULAR.toString(), ec);
+    }
+    for (Iterator<Context> fi = plurals.iterator(); fi.hasNext();) {
+      Context ec = fi.next();
+      addEvent(NumberEnum.PLURAL.toString(),ec);
+    }
+  }
+
+  /**
+   * Returns the number outcome distribution for the specified context.
+   *
+   * @param c The context to evaluate.
+   * @return The model's outcome distribution; index it with
+   *     {@link #getSingularIndex()} and {@link #getPluralIndex()}.
+   */
+  public double[] numberDist(Context c) {
+    List<String> feats = getFeatures(c);
+    return testModel.eval(feats.toArray(new String[feats.size()]));
+  }
+
+  /** @return the index of the singular outcome in the number distribution. */
+  public int getSingularIndex() {
+    return singularIndex;
+  }
+
+  /** @return the index of the plural outcome in the number distribution. */
+  public int getPluralIndex() {
+    return pluralIndex;
+  }
+
+  /**
+   * Trains a GIS model (100 iterations, cutoff 10) from the collected events
+   * and persists it to modelName + modelExtension.
+   *
+   * @throws IOException If the model file can not be written.
+   */
+  public void trainModel() throws IOException {
+    (new SuffixSensitiveGISModelWriter(GIS.trainModel(new CollectionEventStream(events),100,10),new File(modelName+modelExtension))).persist();
+  }
+
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/SemanticCompatibility.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/SemanticCompatibility.java
new file mode 100644
index 0000000..fa84387
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/SemanticCompatibility.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+
+/**
+ * Class which models the semantic compatibility of an entity and the confidence
+ * of that association. Instances are immutable.
+ */
+public class SemanticCompatibility {
+
+  /** The semantic compatibility type. */
+  private final SemanticEnum type;
+  /** The confidence of the compatibility association. */
+  private final double confidence;
+
+  /**
+   * Initializes a semantic compatibility with the specified type and confidence.
+   *
+   * @param type The semantic compatibility type.
+   * @param confidence The confidence of the compatibility association.
+   */
+  public SemanticCompatibility(SemanticEnum type, double confidence) {
+    this.type = type;
+    this.confidence = confidence;
+  }
+
+  /**
+   * Returns the semantic compatibility type.
+   *
+   * @return The semantic compatibility type.
+   */
+  public SemanticEnum getType() {
+    return type;
+  }
+
+  /**
+   * Returns the confidence of the compatibility association.
+   *
+   * @return The confidence.
+   */
+  public double getConfidence() {
+    return confidence;
+  }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/SemanticEnum.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/SemanticEnum.java
new file mode 100644
index 0000000..568ab1d
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/SemanticEnum.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+
+/**
+ * Type-safe enumeration of semantic compatibility values.
+ */
+public class SemanticEnum {
+
+  /** Semantically compatible. */
+  public static final SemanticEnum COMPATIBLE = new SemanticEnum("compatible");
+  /** Semantically incompatible. */
+  public static final SemanticEnum INCOMPATIBLE = new SemanticEnum("incompatible");
+  /** Semantic compatibility Unknown. */
+  public static final SemanticEnum UNKNOWN = new SemanticEnum("unknown");
+
+  /** Human-readable label for this compatibility value. */
+  private final String compatibility;
+
+  private SemanticEnum(String compatibility) {
+    this.compatibility = compatibility;
+  }
+
+  @Override
+  public String toString() {
+    return compatibility;
+  }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/SimilarityModel.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/SimilarityModel.java
new file mode 100644
index 0000000..0ea01a0
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/SimilarityModel.java
@@ -0,0 +1,635 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.maxent.GIS;
+import opennlp.maxent.io.SuffixSensitiveGISModelReader;
+import opennlp.maxent.io.SuffixSensitiveGISModelWriter;
+import opennlp.model.Event;
+import opennlp.model.MaxentModel;
+import opennlp.tools.coref.resolver.ResolverUtils;
+import opennlp.tools.util.CollectionEventStream;
+import opennlp.tools.util.HashList;
+
+/**
+ * Models semantic similarity between two mentions and returns a score based on
+ * how semantically comparable the mentions are with one another.
+ * Operates in one of two modes: training (events are collected via
+ * {@link #setExtents(Context[])} and written by {@link #trainModel()}) or testing
+ * (a previously persisted maxent model is read and evaluated via {@link #compatible(Context, Context)}).
+ */
+public class SimilarityModel implements TestSimilarityModel, TrainSimilarityModel {
+
+  // Base name of the model file; modelExtension is appended when reading/writing.
+  private String modelName;
+  private String modelExtension = ".bin.gz";
+  // Maxent model used for evaluation; only initialized in testing mode.
+  private MaxentModel testModel;
+  // Training events accumulated by setExtents; only initialized in training mode.
+  private List<Event> events;
+  // Index of the SAME outcome in the test model's outcome distribution.
+  private int SAME_INDEX;
+  // Outcomes: the two mentions refer to the same entity, or to different entities.
+  private static final String SAME = "same";
+  private static final String DIFF = "diff";
+  private boolean debugOn = false;
+
+  /**
+   * Returns a similarity model suitable for testing, backed by the maxent model
+   * read from the specified file name.
+   *
+   * @param name The base name of the model file (without extension).
+   * @return The similarity model to be used for testing.
+   * @throws IOException If the model file can not be read.
+   */
+  public static TestSimilarityModel testModel(String name) throws IOException {
+    return new SimilarityModel(name, false);
+  }
+
+  /**
+   * Returns a similarity model suitable for training; events are gathered with
+   * {@link #setExtents(Context[])} and persisted with {@link #trainModel()}.
+   *
+   * @param name The base name of the model file to be written.
+   * @return The similarity model to be used for training.
+   * @throws IOException If an I/O error occurs.
+   */
+  public static TrainSimilarityModel trainModel(String name) throws IOException {
+    SimilarityModel sm = new SimilarityModel(name, true);
+    return sm;
+  }
+
+  // Constructs the model in either training mode (collect events) or testing
+  // mode (read the persisted model and cache the SAME outcome index).
+  private SimilarityModel(String modelName, boolean train) throws IOException {
+    this.modelName = modelName;
+    if (train) {
+      events = new ArrayList<Event>();
+    }
+    else {
+      testModel = (new SuffixSensitiveGISModelReader(new File(modelName+modelExtension))).getModel();
+      SAME_INDEX = testModel.getIndex(SAME);
+    }
+  }
+
+  // Adds a SAME or DIFF training event for the specified pair of mentions.
+  private void addEvent(boolean same, Context np1, Context np2) {
+    if (same) {
+      List<String> feats = getFeatures(np1, np2);
+      //System.err.println(SAME+" "+np1.headTokenText+" ("+np1.id+") -> "+np2.headTokenText+" ("+np2.id+") "+feats);
+      events.add(new Event(SAME, feats.toArray(new String[feats.size()])));
+    }
+    else {
+      List<String> feats = getFeatures(np1, np2);
+      //System.err.println(DIFF+" "+np1.headTokenText+" ("+np1.id+") -> "+np2.headTokenText+" ("+np2.id+") "+feats);
+      events.add(new Event(DIFF, feats.toArray(new String[feats.size()])));
+    }
+  }
+
+  /**
+   * Produces a set of head words for the specified list of mentions.
+   *
+   * @param mentions The mentions to use to construct the head set.
+   *
+   * @return A set containing the lower-cased head words of the specified mentions.
+   */
+  private Set<String> constructHeadSet(List<Context> mentions) {
+    Set<String> headSet = new HashSet<String>();
+    for (Iterator<Context> ei = mentions.iterator(); ei.hasNext();) {
+      Context ec = ei.next();
+      headSet.add(ec.getHeadTokenText().toLowerCase());
+    }
+    return headSet;
+  }
+
+  // Returns true if the two head-word sets share at least one head word.
+  private boolean hasSameHead(Set<String> entityHeadSet, Set<String> candidateHeadSet) {
+    for (Iterator<String> hi = entityHeadSet.iterator(); hi.hasNext();) {
+      if (candidateHeadSet.contains(hi.next())) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Returns true if the two name-type sets share at least one name type.
+  private boolean hasSameNameType(Set<String> entityNameSet, Set<String> candidateNameSet) {
+    for (Iterator<String> hi = entityNameSet.iterator(); hi.hasNext();) {
+      if (candidateNameSet.contains(hi.next())) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Returns true if any entity mention is in a superclass relation (via shared
+  // synsets, see inSuperClass) with any candidate mention.
+  private boolean hasSuperClass(List<Context> entityContexts, List<Context> candidateContexts) {
+    for (Iterator<Context> ei = entityContexts.iterator(); ei.hasNext();) {
+      Context ec = ei.next();
+      for (Iterator<Context> cei = candidateContexts.iterator(); cei.hasNext();) {
+        if (inSuperClass(ec, cei.next())) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Constructs a set of entities which may be semantically compatible with the
+   * entity indicated by the specified entityKey. These mentions are excluded
+   * when sampling negative (DIFF) training examples.
+   *
+   * @param entityKey The key of the entity for which the set is being constructed.
+   * @param entities A mapping between entity keys and their mentions.
+   * @param headSets A mapping between entity keys and their head sets.
+   * @param nameSets A mapping between entity keys and their name sets.
+   * @param singletons A list of all entities which consist of a single mention.
+   *
+   * @return A set of mentions for all the entities which might be semantically compatible
+   * with entity indicated by the specified key.
+   */
+  @SuppressWarnings("unchecked")
+  private Set<Context> constructExclusionSet(Integer entityKey, HashList entities, Map<Integer, Set<String>> headSets, Map<Integer, Set<String>> nameSets, List<Context> singletons) {
+    Set<Context> exclusionSet = new HashSet<Context>();
+    Set<String> entityHeadSet = headSets.get(entityKey);
+    Set<String> entityNameSet = nameSets.get(entityKey);
+    List<Context> entityContexts = (List<Context>) entities.get(entityKey);
+    //entities
+    for (Iterator<Integer> ei = entities.keySet().iterator(); ei.hasNext();) {
+      Integer key = ei.next();
+      List<Context> candidateContexts = (List<Context>) entities.get(key);
+      // Exclude the entity itself, entities without name types, and entities
+      // sharing a head word, a name type, or a synset superclass relation.
+      if (key.equals(entityKey)) {
+        exclusionSet.addAll(candidateContexts);
+      }
+      else if (nameSets.get(key).isEmpty()) {
+        exclusionSet.addAll(candidateContexts);
+      }
+      else if (hasSameHead(entityHeadSet, headSets.get(key))) {
+        exclusionSet.addAll(candidateContexts);
+      }
+      else if (hasSameNameType(entityNameSet, nameSets.get(key))) {
+        exclusionSet.addAll(candidateContexts);
+      }
+      else if (hasSuperClass(entityContexts, candidateContexts)) {
+        exclusionSet.addAll(candidateContexts);
+      }
+    }
+    //singles
+    List<Context> singles = new ArrayList<Context>(1);
+    for (Iterator<Context> si = singletons.iterator(); si.hasNext();) {
+      Context sc = si.next();
+      singles.clear();
+      singles.add(sc);
+      // Same criteria as above, applied to each singleton mention.
+      if (entityHeadSet.contains(sc.getHeadTokenText().toLowerCase())) {
+        exclusionSet.add(sc);
+      }
+      else if (sc.getNameType() == null) {
+        exclusionSet.add(sc);
+      }
+      else if (entityNameSet.contains(sc.getNameType())) {
+        exclusionSet.add(sc);
+      }
+      else if (hasSuperClass(entityContexts, singles)) {
+        exclusionSet.add(sc);
+      }
+    }
+    return exclusionSet;
+  }
+
+  /**
+   * Constructs a mapping between the specified entities and their head set.
+   *
+   * @param entities Mapping between a key and a list of mentions which compose an entity.
+   *
+   * @return a mapping between the keys of the specified entity mapping and the head set
+   * generated from the mentions associated with that key.
+   */
+  @SuppressWarnings("unchecked")
+  private Map<Integer, Set<String>> constructHeadSets(HashList entities) {
+    Map<Integer, Set<String>> headSets = new HashMap<Integer, Set<String>>();
+    for (Iterator<Integer> ei = entities.keySet().iterator(); ei.hasNext();) {
+      Integer key = ei.next();
+      List<Context> entityContexts = (List<Context>) entities.get(key);
+      headSets.put(key, constructHeadSet(entityContexts));
+    }
+    return headSets;
+  }
+
+  /**
+   * Produces the set of name types associated with each of the specified mentions.
+   *
+   * @param mentions A list of mentions.
+   *
+   * @return The set of name types assigned to the specified mentions; mentions
+   * without a name type are skipped.
+   */
+  private Set<String> constructNameSet(List<Context> mentions) {
+    Set<String> nameSet = new HashSet<String>();
+    for (Iterator<Context> ei = mentions.iterator(); ei.hasNext();) {
+      Context ec = ei.next();
+      if (ec.getNameType() != null) {
+        nameSet.add(ec.getNameType());
+      }
+    }
+    return nameSet;
+  }
+
+  /**
+   * Constructs a mapping between the specified entities and the names associated with these entities.
+   *
+   * @param entities A mapping between a key and a list of mentions.
+   *
+   * @return a mapping between each key in the specified entity map and the name types associated with the each mention of that entity.
+   */
+  @SuppressWarnings("unchecked")
+  private Map<Integer, Set<String>> constructNameSets(HashList entities) {
+    Map<Integer, Set<String>> nameSets = new HashMap<Integer, Set<String>>();
+    for (Iterator<Integer> ei = entities.keySet().iterator(); ei.hasNext();) {
+      Integer key = ei.next();
+      List<Context> entityContexts = (List<Context>) entities.get(key);
+      nameSets.put(key, constructNameSet(entityContexts));
+    }
+    return nameSets;
+  }
+
+  // Returns true if one context's synset set is a subset of the other's
+  // (i.e. one appears to be a superclass of the other); false if either has
+  // no synsets, the sets are disjoint, or they only partially overlap.
+  private boolean inSuperClass(Context ec, Context cec) {
+    if (ec.getSynsets().size() == 0 || cec.getSynsets().size() == 0) {
+      return false;
+    }
+    else {
+      int numCommonSynsets = 0;
+      for (Iterator<String> si = ec.getSynsets().iterator(); si.hasNext();) {
+        String synset = si.next();
+        if (cec.getSynsets().contains(synset)) {
+          numCommonSynsets++;
+        }
+      }
+      if (numCommonSynsets == 0) {
+        return false;
+      }
+      else if (numCommonSynsets == ec.getSynsets().size() || numCommonSynsets == cec.getSynsets().size()) {
+        return true;
+      }
+      else {
+        return false;
+      }
+    }
+  }
+
+  /*
+  private boolean isPronoun(MentionContext mention) {
+    return mention.getHeadTokenTag().startsWith("PRP");
+  }
+  */
+
+  /**
+   * Gathers training events from the specified extents. Pairs of mentions from
+   * the same entity generate SAME events; for each such pair, one DIFF event is
+   * sampled round-robin from the extents not in the entity's exclusion set.
+   *
+   * @param extentContexts The extents to generate training events from.
+   */
+  @SuppressWarnings("unchecked")
+  public void setExtents(Context[] extentContexts) {
+    HashList entities = new HashList();
+    /** Extents which are not in a coreference chain. */
+    List<Context> singletons = new ArrayList<Context>();
+    List<Context> allExtents = new ArrayList<Context>();
+    //populate data structures
+    for (int ei = 0, el = extentContexts.length; ei < el; ei++) {
+      Context ec = extentContexts[ei];
+      //System.err.println("SimilarityModel: setExtents: ec("+ec.getId()+") "+ec.getNameType()+" "+ec);
+      if (ec.getId() == -1) {
+        singletons.add(ec);
+      }
+      else {
+        entities.put(ec.getId(), ec);
+      }
+      allExtents.add(ec);
+    }
+
+    // Round-robin cursor into allExtents used for negative (DIFF) sampling;
+    // deliberately persists across entities so negatives are spread out.
+    int axi = 0;
+    Map<Integer, Set<String>> headSets = constructHeadSets(entities);
+    Map<Integer, Set<String>> nameSets = constructNameSets(entities);
+
+    for (Iterator<Integer> ei = entities.keySet().iterator(); ei.hasNext();) {
+      Integer key = ei.next();
+      Set<String> entityNameSet = nameSets.get(key);
+      // Entities without any name type are skipped entirely.
+      if (entityNameSet.isEmpty()) {
+        continue;
+      }
+      List<Context> entityContexts = (List<Context>) entities.get(key);
+      Set<Context> exclusionSet = constructExclusionSet(key, entities, headSets, nameSets, singletons);
+      // NOTE(review): this branch is an intentional-looking no-op; single-mention
+      // entities produce no pairs in the loops below anyway.
+      if (entityContexts.size() == 1) {
+      }
+      // Every ordered pair of distinct mentions within the entity is a SAME example.
+      for (int xi1 = 0, xl = entityContexts.size(); xi1 < xl; xi1++) {
+        Context ec1 = entityContexts.get(xi1);
+        //if (isPronoun(ec1)) {
+        //  continue;
+        //}
+        for (int xi2 = xi1 + 1; xi2 < xl; xi2++) {
+          Context ec2 = entityContexts.get(xi2);
+          //if (isPronoun(ec2)) {
+          //  continue;
+          //}
+          addEvent(true, ec1, ec2);
+          // Sample one DIFF example: advance the cursor until an extent outside
+          // the exclusion set is found, or we wrap around to where we started.
+          int startIndex = axi;
+          do {
+            Context sec1 = allExtents.get(axi);
+            axi = (axi + 1) % allExtents.size();
+            if (!exclusionSet.contains(sec1)) {
+              if (debugOn) System.err.println(ec1.toString()+" "+entityNameSet+" "+sec1.toString()+" "+nameSets.get(sec1.getId()));
+              addEvent(false, ec1, sec1);
+              break;
+            }
+          }
+          while (axi != startIndex);
+        }
+      }
+    }
+  }
+
+  /**
+   * Returns a number between 0 and 1 which represents the models belief that the specified mentions are compatible.
+   * Value closer to 1 are more compatible, while values closer to 0 are less compatible.
+   * @param mention1 The first mention to be considered.
+   * @param mention2 The second mention to be considered.
+   * @return a number between 0 and 1 which represents the models belief that the specified mentions are compatible.
+   */
+  public double compatible(Context mention1, Context mention2) {
+    List<String> feats = getFeatures(mention1, mention2);
+    if (debugOn) System.err.println("SimilarityModel.compatible: feats="+feats);
+    return (testModel.eval(feats.toArray(new String[feats.size()]))[SAME_INDEX]);
+  }
+
+  /**
+   * Train a model based on the previously supplied evidence. Trains a GIS model
+   * (100 iterations, cutoff 10) and persists it to modelName + modelExtension;
+   * when debugging is on, the raw events are also dumped to modelName + ".events".
+   * @see #setExtents(Context[])
+   */
+  public void trainModel() throws IOException {
+    if (debugOn) {
+      FileWriter writer = new FileWriter(modelName+".events");
+      for (Iterator<Event> ei=events.iterator();ei.hasNext();) {
+        Event e = ei.next();
+        writer.write(e.toString()+"\n");
+      }
+      writer.close();
+    }
+    (new SuffixSensitiveGISModelWriter(GIS.trainModel(
+        new CollectionEventStream(events),100,10),
+        new File(modelName+modelExtension))).persist();
+  }
+
+  // True if the head tag marks a proper noun (NNP, NNPS).
+  private boolean isName(Context np) {
+    return np.getHeadTokenTag().startsWith("NNP");
+  }
+
+  // True if the head tag marks a common noun (NN but not NNP).
+  private boolean isCommonNoun(Context np) {
+    return !np.getHeadTokenTag().startsWith("NNP") && np.getHeadTokenTag().startsWith("NN");
+  }
+
+  // True if the head tag marks a pronoun (PRP, PRP$).
+  private boolean isPronoun(Context np) {
+    return np.getHeadTokenTag().startsWith("PRP");
+  }
+
+  // True if the head tag marks a cardinal number (CD).
+  private boolean isNumber(Context np) {
+    return np.getHeadTokenTag().equals("CD");
+  }
+
+  // Features for a (proper name, common noun) mention pair.
+  private List<String> getNameCommonFeatures(Context name, Context common) {
+    Set<String> synsets = common.getSynsets();
+    List<String> features = new ArrayList<String>(2 + synsets.size());
+    features.add("nn=" + name.getNameType() + "," + common.getNameType());
+    features.add("nw=" + name.getNameType() + "," + common.getHeadTokenText().toLowerCase());
+    for (Iterator<String> si = synsets.iterator(); si.hasNext();) {
+      features.add("ns=" + name.getNameType() + "," + si.next());
+    }
+    if (name.getNameType() == null) {
+      //features.addAll(getCommonCommonFeatures(name,common));
+    }
+    return features;
+  }
+
+  // Features for a (proper name, number) mention pair.
+  private List<String> getNameNumberFeatures(Context name, Context number) {
+    List<String> features = new ArrayList<String>(2);
+    features.add("nt=" + name.getNameType() + "," + number.getHeadTokenTag());
+    features.add("nn=" + name.getNameType() + "," + number.getNameType());
+    return features;
+  }
+
+  // Features for a (proper name, pronoun) mention pair.
+  private List<String> getNamePronounFeatures(Context name, Context pronoun) {
+    List<String> features = new ArrayList<String>(2);
+    features.add("nw=" + name.getNameType() + "," + pronoun.getHeadTokenText().toLowerCase());
+    features.add("ng=" + name.getNameType() + "," + ResolverUtils.getPronounGender(
+        pronoun.getHeadTokenText().toLowerCase()));
+    return features;
+  }
+
+  // Features for a (common noun, pronoun) mention pair.
+  private List<String> getCommonPronounFeatures(Context common, Context pronoun) {
+    List<String> features = new ArrayList<String>();
+    Set<String> synsets1 = common.getSynsets();
+    String p = pronoun.getHeadTokenText().toLowerCase();
+    String gen = ResolverUtils.getPronounGender(p);
+    features.add("wn=" + p + "," + common.getNameType());
+    for (Iterator<String> si = synsets1.iterator(); si.hasNext();) {
+      String synset = si.next();
+      features.add("ws=" + p + "," + synset);
+      features.add("gs=" + gen + "," + synset);
+    }
+    return features;
+  }
+
+  // Features for a (common noun, number) mention pair.
+  private List<String> getCommonNumberFeatures(Context common, Context number) {
+    List<String> features = new ArrayList<String>();
+    Set<String> synsets1 = common.getSynsets();
+    for (Iterator<String> si = synsets1.iterator(); si.hasNext();) {
+      String synset = si.next();
+      features.add("ts=" + number.getHeadTokenTag() + "," + synset);
+      features.add("ns=" + number.getNameType() + "," + synset);
+    }
+    features.add("nn=" + number.getNameType() + "," + common.getNameType());
+    return features;
+  }
+
+  // Features for a (number, pronoun) mention pair.
+  private List<String> getNumberPronounFeatures(Context number, Context pronoun) {
+    List<String> features = new ArrayList<String>();
+    String p = pronoun.getHeadTokenText().toLowerCase();
+    String gen = ResolverUtils.getPronounGender(p);
+    features.add("wt=" + p + "," + number.getHeadTokenTag());
+    features.add("wn=" + p + "," + number.getNameType());
+    features.add("wt=" + gen + "," + number.getHeadTokenTag());
+    features.add("wn=" + gen + "," + number.getNameType());
+    return features;
+  }
+
+  // Features for a (proper name, proper name) mention pair; the name-type pair
+  // is ordered lexicographically so the feature is symmetric.
+  private List<String> getNameNameFeatures(Context name1, Context name2) {
+    List<String> features = new ArrayList<String>(1);
+    if (name1.getNameType() == null && name2.getNameType() == null) {
+      features.add("nn=" + name1.getNameType() + "," + name2.getNameType());
+      //features.addAll(getCommonCommonFeatures(name1,name2));
+    }
+    else if (name1.getNameType() == null) {
+      features.add("nn=" + name1.getNameType() + "," + name2.getNameType());
+      //features.addAll(getNameCommonFeatures(name2,name1));
+    }
+    else if (name2.getNameType() == null) {
+      features.add("nn=" + name2.getNameType() + "," + name1.getNameType());
+      //features.addAll(getNameCommonFeatures(name1,name2));
+    }
+    else {
+      if (name1.getNameType().compareTo(name2.getNameType()) < 0) {
+        features.add("nn=" + name1.getNameType() + "," + name2.getNameType());
+      }
+      else {
+        features.add("nn=" + name2.getNameType() + "," + name1.getNameType());
+      }
+      if (name1.getNameType().equals(name2.getNameType())) {
+        features.add("sameNameType");
+      }
+    }
+    return features;
+  }
+
+  // Features for a (common noun, common noun) mention pair, based on the
+  // overlap between the two mentions' synset sets.
+  private List<String> getCommonCommonFeatures(Context common1, Context common2) {
+    List<String> features = new ArrayList<String>();
+    Set<String> synsets1 = common1.getSynsets();
+    Set<String> synsets2 = common2.getSynsets();
+
+    if (synsets1.size() == 0) {
+      //features.add("missing_"+common1.headToken);
+      return features;
+    }
+    if (synsets2.size() == 0) {
+      //features.add("missing_"+common2.headToken);
+      return features;
+    }
+    int numCommonSynsets = 0;
+    for (Iterator<String> si = synsets1.iterator(); si.hasNext();) {
+      String synset = si.next();
+      if (synsets2.contains(synset)) {
+        features.add("ss=" + synset);
+        numCommonSynsets++;
+      }
+    }
+    // Summarize the overlap: no common synsets, identical sets, or one set
+    // contained in the other (an is-a relation in one direction).
+    if (numCommonSynsets == 0) {
+      features.add("ncss");
+    }
+    else if (numCommonSynsets == synsets1.size() && numCommonSynsets == synsets2.size()) {
+      features.add("samess");
+    }
+    else if (numCommonSynsets == synsets1.size()) {
+      features.add("2isa1");
+      //features.add("2isa1-"+(synsets2.size() - numCommonSynsets));
+    }
+    else if (numCommonSynsets == synsets2.size()) {
+      features.add("1isa2");
+      //features.add("1isa2-"+(synsets1.size() - numCommonSynsets));
+    }
+    return features;
+  }
+
+  // Features for a (pronoun, pronoun) mention pair: gender agreement only.
+  private List<String> getPronounPronounFeatures(Context pronoun1, Context pronoun2) {
+    List<String> features = new ArrayList<String>();
+    String g1 = ResolverUtils.getPronounGender(pronoun1.getHeadTokenText());
+    String g2 = ResolverUtils.getPronounGender(pronoun2.getHeadTokenText());
+    if (g1.equals(g2)) {
+      features.add("sameGender");
+    }
+    else {
+      features.add("diffGender");
+    }
+    return features;
+  }
+
+  /**
+   * Produces the features used to judge the compatibility of the specified pair
+   * of mentions: shared head-word features plus pair features dispatched on the
+   * syntactic category (name/common/pronoun/number) of each mention.
+   *
+   * @param np1 The first mention.
+   * @param np2 The second mention.
+   * @return A list of feature strings.
+   */
+  private List<String> getFeatures(Context np1, Context np2) {
+    List<String> features = new ArrayList<String>();
+    features.add("default");
+    // semantic categories
+    String w1 = np1.getHeadTokenText().toLowerCase();
+    String w2 = np2.getHeadTokenText().toLowerCase();
+    // Order the word pair lexicographically so the feature is symmetric.
+    if (w1.compareTo(w2) < 0) {
+      features.add("ww=" + w1 + "," + w2);
+    }
+    else {
+      features.add("ww=" + w2 + "," + w1);
+    }
+    if (w1.equals(w2)) {
+      features.add("sameHead");
+    }
+    //features.add("tt="+np1.headTag+","+np2.headTag);
+    if (isName(np1)) {
+      if (isName(np2)) {
+        features.addAll(getNameNameFeatures(np1, np2));
+      }
+      else if (isCommonNoun(np2)) {
+        features.addAll(getNameCommonFeatures(np1, np2));
+      }
+      else if (isPronoun(np2)) {
+        features.addAll(getNamePronounFeatures(np1, np2));
+      }
+      else if (isNumber(np2)) {
+        features.addAll(getNameNumberFeatures(np1, np2));
+      }
+    }
+    else if (isCommonNoun(np1)) {
+      if (isName(np2)) {
+        features.addAll(getNameCommonFeatures(np2, np1));
+      }
+      else if (isCommonNoun(np2)) {
+        features.addAll(getCommonCommonFeatures(np1, np2));
+      }
+      else if (isPronoun(np2)) {
+        features.addAll(getCommonPronounFeatures(np1, np2));
+      }
+      else if (isNumber(np2)) {
+        features.addAll(getCommonNumberFeatures(np1, np2));
+      }
+      else {
+        //System.err.println("unknown group for " + np1.headTokenText + " -> " + np2.headTokenText);
+      }
+    }
+    else if (isPronoun(np1)) {
+      if (isName(np2)) {
+        features.addAll(getNamePronounFeatures(np2, np1));
+      }
+      else if (isCommonNoun(np2)) {
+        features.addAll(getCommonPronounFeatures(np2, np1));
+      }
+      else if (isPronoun(np2)) {
+        features.addAll(getPronounPronounFeatures(np1, np2));
+      }
+      else if (isNumber(np2)) {
+        features.addAll(getNumberPronounFeatures(np2, np1));
+      }
+      else {
+        //System.err.println("unknown group for " + np1.headTokenText + " -> " + np2.headTokenText);
+      }
+    }
+    else if (isNumber(np1)) {
+      if (isName(np2)) {
+        features.addAll(getNameNumberFeatures(np2, np1));
+      }
+      else if (isCommonNoun(np2)) {
+        features.addAll(getCommonNumberFeatures(np2, np1));
+      }
+      else if (isPronoun(np2)) {
+        features.addAll(getNumberPronounFeatures(np1, np2));
+      }
+      else if (isNumber(np2)) {}
+      else {
+        //System.err.println("unknown group for " + np1.headTokenText + " -> " + np2.headTokenText);
+      }
+    }
+    else {
+      //System.err.println("unknown group for " + np1.headToken);
+    }
+    return (features);
+  }
+
+  /**
+   * Command-line entry point: loads the model named by args[0] in testing mode,
+   * then for each space-separated pair of parseable contexts read from standard
+   * input prints the compatibility score followed by the generated features.
+   */
+  public static void main(String[] args) throws IOException {
+    if (args.length == 0) {
+      System.err.println("Usage: SimilarityModel modelName < tiger/NN bear/NN");
+      System.exit(1);
+    }
+    String modelName = args[0];
+    SimilarityModel model = new SimilarityModel(modelName, false);
+    //Context.wn = new WordNet(System.getProperty("WNHOME"), true);
+    //Context.morphy = new Morphy(Context.wn);
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    for (String line = in.readLine(); line != null; line = in.readLine()) {
+      String[] words = line.split(" ");
+      double p = model.compatible(Context.parseContext(words[0]), Context.parseContext(words[1]));
+      System.out.println(p + " " + model.getFeatures(Context.parseContext(words[0]), Context.parseContext(words[1])));
+    }
+  }
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/TestGenderModel.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/TestGenderModel.java
new file mode 100644
index 0000000..85af05f
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/TestGenderModel.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+
+/**
+ * Interface for testing a gender model.
+ */
+public interface TestGenderModel {
+
+  /**
+   * Computes the gender distribution for the specified context.
+   * The returned array is indexed by {@link #getMaleIndex()},
+   * {@link #getFemaleIndex()}, and {@link #getNeuterIndex()}.
+   *
+   * @param np1 The context whose gender is to be determined.
+   * @return The probability distribution over gender outcomes.
+   */
+  double[] genderDistribution(Context np1);
+
+  /** @return The index into the distribution array for the male outcome. */
+  int getMaleIndex();
+
+  /** @return The index into the distribution array for the female outcome. */
+  int getFemaleIndex();
+
+  /** @return The index into the distribution array for the neuter outcome. */
+  int getNeuterIndex();
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/TestNumberModel.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/TestNumberModel.java
new file mode 100644
index 0000000..6172fe5
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/TestNumberModel.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+
+/**
+ * Interface for testing a number model.
+ *
+ */
+public interface TestNumberModel {
+
+  /**
+   * Computes the number (singular/plural) distribution for the specified context.
+   * The returned array is indexed by {@link #getSingularIndex()} and
+   * {@link #getPluralIndex()}.
+   *
+   * @param np1 The context whose number is to be determined.
+   * @return The probability distribution over number outcomes.
+   */
+  double[] numberDist(Context np1);
+
+  /** @return The index into the distribution array for the singular outcome. */
+  int getSingularIndex();
+
+  /** @return The index into the distribution array for the plural outcome. */
+  int getPluralIndex();
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/TestSimilarityModel.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/TestSimilarityModel.java
new file mode 100644
index 0000000..8bc9fa8
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/TestSimilarityModel.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+
+
+/**
+ * Interface for testing a similarity model.
+ */
+public interface TestSimilarityModel {
+
+  /**
+   * Computes the compatibility of the two specified contexts.
+   *
+   * @param np1 The first context.
+   * @param np2 The second context.
+   * @return A score indicating how compatible the two contexts are.
+   */
+  double compatible(Context np1, Context np2);
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/TrainSimilarityModel.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/TrainSimilarityModel.java
new file mode 100644
index 0000000..1704013
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/TrainSimilarityModel.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.coref.sim;
+
+import java.io.IOException;
+
+/**
+ * Interface for training a similarity, gender, or number model.
+ */
+public interface TrainSimilarityModel {
+
+  /**
+   * Trains the model over the extents previously supplied via
+   * {@link #setExtents(Context[])}.
+   *
+   * @throws IOException if the model cannot be written.
+   */
+  void trainModel() throws IOException;
+
+  /**
+   * Creates similarity training pairs based on the specified extents.
+   * Extents are considered compatible if they are in the same coreference chain,
+   * have the same named-entity tag, or share a common head word. Incompatible
+   * extents are chosen at random from the set of extents which don't meet
+   * these criteria.
+   *
+   * @param extents The extents from which training pairs are derived.
+   */
+  void setExtents(Context[] extents);
+}
diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/package-info.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/package-info.java
new file mode 100644
index 0000000..535211a
--- /dev/null
+++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Package related to modeling mention similarity for coreference resolution.
+ */
+package opennlp.tools.coref.sim;
\ No newline at end of file