// blob: 89f4b79172d8d954eae59b8d10664c4013cfa0fa [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.coref.resolver;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import opennlp.tools.coref.DiscourseEntity;
import opennlp.tools.coref.mention.MentionContext;
/**
 * Resolves coreference between proper nouns.
 *
 * <p>In addition to the features produced by {@link MaxentResolver}, this resolver
 * adds string-match features and a {@code knownAcronym} feature. The acronym feature
 * is driven by a tab-separated acronym table that is read once from
 * {@code <projectName>/acronyms} and kept in a static map shared by all instances.
 *
 * <p>NOTE(review): the lazy, unsynchronized initialisation of the shared acronym map
 * is not guarded against concurrent construction — confirm resolvers are only built
 * from a single thread.
 */
public class ProperNounResolver extends MaxentResolver {

  /** Bidirectional lookup: acronym -> expansions and expansion -> acronyms. */
  private static Map<String, Set<String>> acroMap;

  /** Set once the acronym table has been read (or the read has been attempted). */
  private static boolean acroMapLoaded = false;

  /**
   * Creates a proper noun resolver backed by the {@code pnmodel} model.
   *
   * @param projectName directory holding the model and the {@code acronyms} file
   * @param m the mode the resolver runs in
   * @throws IOException if the superclass fails to set up the model
   */
  public ProperNounResolver(String projectName, ResolverMode m) throws IOException {
    super(projectName, "pnmodel", m, 500);
    if (!acroMapLoaded) {
      initAcronyms(projectName + "/acronyms");
      acroMapLoaded = true;
    }
    showExclusions = false;
  }

  /**
   * Creates a proper noun resolver backed by the {@code pnmodel} model that also
   * uses the supplied non-referential resolver.
   *
   * @param projectName directory holding the model and the {@code acronyms} file
   * @param m the mode the resolver runs in
   * @param nonRefResolver resolver handed through to the superclass
   * @throws IOException if the superclass fails to set up the model
   */
  public ProperNounResolver(String projectName, ResolverMode m, NonReferentialResolver nonRefResolver)
      throws IOException {
    super(projectName, "pnmodel", m, 500, nonRefResolver);
    if (!acroMapLoaded) {
      initAcronyms(projectName + "/acronyms");
      acroMapLoaded = true;
    }
    showExclusions = false;
  }

  /**
   * Tells whether this resolver handles the given mention.
   *
   * @param mention the mention to test
   * @return {@code true} if the head token tag starts with {@code NNP} or {@code CD}
   */
  public boolean canResolve(MentionContext mention) {
    String headTag = mention.getHeadTokenTag();
    return headTag.startsWith("NNP") || headTag.startsWith("CD");
  }

  /**
   * Loads the acronym table into {@link #acroMap}.
   *
   * <p>Each line is expected to hold an acronym and its expansion separated by a tab.
   * Both directions are recorded. Lines with fewer than two fields (for example blank
   * lines) are skipped. A missing or unreadable file is reported on {@code System.err}
   * and leaves whatever was read so far in the map; it does not abort construction.
   *
   * @param name path of the acronym file
   */
  private void initAcronyms(String name) {
    acroMap = new HashMap<>(15000);
    // try-with-resources guarantees the reader is closed on every exit path.
    try (BufferedReader reader = new BufferedReader(new FileReader(name))) {
      String line;
      while (null != (line = reader.readLine())) {
        StringTokenizer st = new StringTokenizer(line, "\t");
        if (st.countTokens() < 2) {
          // Malformed or empty line: nothing usable to record.
          continue;
        }
        String acro = st.nextToken();
        String full = st.nextToken();
        addAcronymMapping(acro, full);
        addAcronymMapping(full, acro);
      }
    }
    catch (IOException e) {
      System.err.println("ProperNounResolver.initAcronyms: Acronym Database not found: " + e);
    }
  }

  /** Records {@code value} in the set stored under {@code key}, creating the set if needed. */
  private static void addAcronymMapping(String key, String value) {
    Set<String> exSet = acroMap.get(key);
    if (exSet == null) {
      exSet = new HashSet<>();
      acroMap.put(key, exSet);
    }
    exSet.add(value);
  }

  /**
   * Checks the acronym table for a pairing of the two strings.
   *
   * @param ecStrip the string used as lookup key
   * @param xecStrip the string that must be listed under that key
   * @return {@code true} if {@code xecStrip} is recorded for {@code ecStrip}
   */
  private boolean isAcronym(String ecStrip, String xecStrip) {
    Set<String> exSet = acroMap.get(ecStrip);
    return exSet != null && exSet.contains(xecStrip);
  }

  /**
   * Produces the acronym feature for a mention/entity pair.
   *
   * @param mention the mention being resolved
   * @param entity the candidate entity
   * @return a single-element list containing {@code knownAcronym} when the stripped
   *     mention and the stripped proper-noun extent of the entity are paired in the
   *     acronym table; otherwise an empty list
   */
  protected List<String> getAcronymFeatures(MentionContext mention, DiscourseEntity entity) {
    MentionContext xec = ResolverUtils.getProperNounExtent(entity);
    String ecStrip = ResolverUtils.stripNp(mention);
    String xecStrip = ResolverUtils.stripNp(xec);
    if (ecStrip != null && xecStrip != null && isAcronym(ecStrip, xecStrip)) {
      List<String> features = new ArrayList<>(1);
      features.add("knownAcronym");
      return features;
    }
    return Collections.emptyList();
  }

  /**
   * Extends the superclass features with string-match and acronym features.
   *
   * @param mention the mention being resolved
   * @param entity the candidate entity; when {@code null} only the superclass
   *     features are returned
   * @return a new, mutable list of features
   */
  @Override
  protected List<String> getFeatures(MentionContext mention, DiscourseEntity entity) {
    List<String> features = new ArrayList<>(super.getFeatures(mention, entity));
    if (entity != null) {
      features.addAll(ResolverUtils.getStringMatchFeatures(mention, entity));
      features.addAll(getAcronymFeatures(mention, entity));
    }
    return features;
  }

  /**
   * Excludes an entity when the superclass excludes it or when none of its mentions
   * has a head token tagged as a proper noun.
   *
   * @param mention the mention being resolved
   * @param entity the candidate entity
   * @return {@code true} if the entity must not be considered for this mention
   */
  @Override
  public boolean excluded(MentionContext mention, DiscourseEntity entity) {
    if (super.excluded(mention, entity)) {
      return true;
    }
    for (Iterator<MentionContext> ei = entity.getMentions(); ei.hasNext();) {
      MentionContext xec = ei.next();
      // A single proper-noun mention is enough to keep the entity as a candidate.
      if (xec.getHeadTokenTag().startsWith("NNP")) {
        return false;
      }
    }
    return true;
  }
}