blob: 37629d30c2113cf80154b84a911d6b7d7e055d8f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.coref.resolver;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import opennlp.tools.coref.DiscourseEntity;
import opennlp.tools.coref.mention.MentionContext;
/**
* Resolves coreference between appositives.
*/
public class IsAResolver extends MaxentResolver {
Pattern predicativePattern;
public IsAResolver(String projectName, ResolverMode m) throws IOException {
super(projectName, "/imodel", m, 20);
showExclusions = false;
//predicativePattern = Pattern.compile("^(,|am|are|is|was|were|--)$");
predicativePattern = Pattern.compile("^(,|--)$");
}
public IsAResolver(String projectName, ResolverMode m, NonReferentialResolver nrr) throws IOException {
super(projectName, "/imodel", m, 20,nrr);
showExclusions = false;
//predicativePattern = Pattern.compile("^(,|am|are|is|was|were|--)$");
predicativePattern = Pattern.compile("^(,|--)$");
}
public boolean canResolve(MentionContext ec) {
if (ec.getHeadTokenTag().startsWith("NN")) {
return (ec.getPreviousToken() != null && predicativePattern.matcher(ec.getPreviousToken().toString()).matches());
}
return false;
}
@Override
protected boolean excluded(MentionContext ec, DiscourseEntity de) {
MentionContext cec = de.getLastExtent();
//System.err.println("IsAResolver.excluded?: ec.span="+ec.getSpan()+" cec.span="+cec.getSpan()+" cec="+cec.toText()+" lastToken="+ec.getNextToken());
if (ec.getSentenceNumber() != cec.getSentenceNumber()) {
//System.err.println("IsAResolver.excluded: (true) not same sentence");
return (true);
}
//shallow parse appositives
//System.err.println("IsAResolver.excluded: ec="+ec.toText()+" "+ec.span+" cec="+cec.toText()+" "+cec.span);
if (cec.getIndexSpan().getEnd() == ec.getIndexSpan().getStart() - 2) {
return (false);
}
//full parse w/o trailing comma
if (cec.getIndexSpan().getEnd() == ec.getIndexSpan().getEnd()) {
//System.err.println("IsAResolver.excluded: (false) spans share end");
return (false);
}
//full parse w/ trailing comma or period
if (cec.getIndexSpan().getEnd() <= ec.getIndexSpan().getEnd() + 2 && (ec.getNextToken() != null && (ec.getNextToken().toString().equals(",") || ec.getNextToken().toString().equals(".")))) {
//System.err.println("IsAResolver.excluded: (false) spans end + punct");
return (false);
}
//System.err.println("IsAResolver.excluded: (true) default");
return (true);
}
@Override
protected boolean outOfRange(MentionContext ec, DiscourseEntity de) {
MentionContext cec = de.getLastExtent();
return (cec.getSentenceNumber() != ec.getSentenceNumber());
}
@Override
protected boolean defaultReferent(DiscourseEntity de) {
return (true);
}
@Override
protected List<String> getFeatures(MentionContext mention, DiscourseEntity entity) {
List<String> features = new ArrayList<String>();
features.addAll(super.getFeatures(mention, entity));
if (entity != null) {
MentionContext ant = entity.getLastExtent();
List<String> leftContexts = ResolverUtils.getContextFeatures(ant);
for (int ci = 0, cn = leftContexts.size(); ci < cn; ci++) {
features.add("l" + leftContexts.get(ci));
}
List<String> rightContexts = ResolverUtils.getContextFeatures(mention);
for (int ci = 0, cn = rightContexts.size(); ci < cn; ci++) {
features.add("r" + rightContexts.get(ci));
}
features.add("hts"+ant.getHeadTokenTag()+","+mention.getHeadTokenTag());
}
/*
if (entity != null) {
//System.err.println("MaxentIsResolver.getFeatures: ["+ec2.toText()+"] -> ["+de.getLastExtent().toText()+"]");
//previous word and tag
if (ant.prevToken != null) {
features.add("pw=" + ant.prevToken);
features.add("pt=" + ant.prevToken.getSyntacticType());
}
else {
features.add("pw=<none>");
features.add("pt=<none>");
}
//next word and tag
if (mention.nextToken != null) {
features.add("nw=" + mention.nextToken);
features.add("nt=" + mention.nextToken.getSyntacticType());
}
else {
features.add("nw=<none>");
features.add("nt=<none>");
}
//modifier word and tag for c1
int i = 0;
List c1toks = ant.tokens;
for (; i < ant.headTokenIndex; i++) {
features.add("mw=" + c1toks.get(i));
features.add("mt=" + ((Parse) c1toks.get(i)).getSyntacticType());
}
//head word and tag for c1
features.add("mh=" + c1toks.get(i));
features.add("mt=" + ((Parse) c1toks.get(i)).getSyntacticType());
//modifier word and tag for c2
i = 0;
List c2toks = mention.tokens;
for (; i < mention.headTokenIndex; i++) {
features.add("mw=" + c2toks.get(i));
features.add("mt=" + ((Parse) c2toks.get(i)).getSyntacticType());
}
//head word and tag for n2
features.add("mh=" + c2toks.get(i));
features.add("mt=" + ((Parse) c2toks.get(i)).getSyntacticType());
//word/tag pairs
for (i = 0; i < ant.headTokenIndex; i++) {
for (int j = 0; j < mention.headTokenIndex; j++) {
features.add("w=" + c1toks.get(i) + "|" + "w=" + c2toks.get(j));
features.add("w=" + c1toks.get(i) + "|" + "t=" + ((Parse) c2toks.get(j)).getSyntacticType());
features.add("t=" + ((Parse) c1toks.get(i)).getSyntacticType() + "|" + "w=" + c2toks.get(j));
features.add("t=" + ((Parse) c1toks.get(i)).getSyntacticType() + "|" + "t=" + ((Parse) c2toks.get(j)).getSyntacticType());
}
}
features.add("ht=" + ant.headTokenTag + "|" + "ht=" + mention.headTokenTag);
features.add("ht1=" + ant.headTokenTag);
features.add("ht2=" + mention.headTokenTag);
*/
//semantic categories
/*
if (ant.neType != null) {
if (re.neType != null) {
features.add("sc="+ant.neType+","+re.neType);
}
else if (!re.headTokenTag.startsWith("NNP") && re.headTokenTag.startsWith("NN")) {
Set synsets = re.synsets;
for (Iterator si=synsets.iterator();si.hasNext();) {
features.add("sc="+ant.neType+","+si.next());
}
}
}
else if (!ant.headTokenTag.startsWith("NNP") && ant.headTokenTag.startsWith("NN")) {
if (re.neType != null) {
Set synsets = ant.synsets;
for (Iterator si=synsets.iterator();si.hasNext();) {
features.add("sc="+re.neType+","+si.next());
}
}
else if (!re.headTokenTag.startsWith("NNP") && re.headTokenTag.startsWith("NN")) {
//System.err.println("MaxentIsaResolover.getFeatures: both common re="+re.parse+" ant="+ant.parse);
Set synsets1 = ant.synsets;
Set synsets2 = re.synsets;
for (Iterator si=synsets1.iterator();si.hasNext();) {
Object synset = si.next();
if (synsets2.contains(synset)) {
features.add("sc="+synset);
}
}
}
}
}
*/
//System.err.println("MaxentIsResolver.getFeatures: "+features.toString());
return (features);
}
}