/*
 * Copyright 2013 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.addons.geoentitylinker;

import java.io.File;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.BaseLink;
import opennlp.tools.entitylinker.LinkedSpan;
import opennlp.tools.util.Span;

import org.apache.log4j.Logger;
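
/*
 * Example wiring (a minimal sketch; the properties file name, and the
 * linkedSpans/docText/sentenceSpans/countryContext variables produced by an
 * upstream entity-linking pass, are assumptions, not part of this class):
 *
 *   EntityLinkerProperties props =
 *       new EntityLinkerProperties(new File("entitylinker.properties"));
 *   ModelBasedScorer scorer = new ModelBasedScorer();
 *   scorer.score(linkedSpans, docText, sentenceSpans, props, countryContext);
 *   // each BaseLink now carries a "countrymodel" entry in its score map
 */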
/**
 * Utilizes a doccat model to score toponyms based on the context surrounding
 * each mention.
 */
public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {

  private static final Logger LOGGER = Logger.getLogger(ModelBasedScorer.class);

  public static final int RADIUS = 200;

  DocumentCategorizerME documentCategorizerME;
  DoccatModel doccatModel;
  boolean modelexists = false;
  @Override
  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
      EntityLinkerProperties properties, CountryContext additionalContext) {
    try {
      // lazily load the doccat model on the first call
      if (doccatModel == null) {
        String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
        if (path.isEmpty()) {
          return;
        }
        modelexists = true;
        doccatModel = new DoccatModel(new File(path));
        documentCategorizerME = new DocumentCategorizerME(doccatModel);
      }
      Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
      for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) {
        Map<String, Double> scores = this.getScore(entry.getValue());
        for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) {
          // default to 0 when the model produced no score for this link's country
          double score = 0d;
          if (scores.containsKey(link.getItemParentID())) {
            score = scores.get(link.getItemParentID());
          }
          link.getScoreMap().put("countrymodel", score);
        }
      }
    } catch (Exception ex) {
      // covers FileNotFoundException and IOException as well; log and leave
      // any previously computed scores untouched
      LOGGER.error(ex);
    }
  }
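
  /*
   * The model location is read from the EntityLinkerProperties passed to
   * score(), e.g. a line like the following in the properties file (the path
   * value is illustrative):
   *
   *   opennlp.geoentitylinker.modelbasedscorer.modelpath=/path/to/geocountry.doccat
   */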
  /**
   * Generates features using a BagOfWordsFeatureGenerator that are within the
   * given radius of a mention within the docText.
   *
   * @param linkedSpans   the spans that were linked to gazetteer entries
   * @param sentenceSpans the sentence boundaries within docText
   * @param docText       the full document text
   * @param radius        the number of characters to take on either side of a
   *                      mention
   * @return a map from the index of each linked span to the string of
   *         surrounding text: Map&lt;indexOfSpan, surroundingText&gt;
   */
  public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) {
    Map<Integer, String> featureBags = new HashMap<>();
    Map<Integer, Integer> nameMentionMap = new HashMap<>();
    /*
     * iterate over the linked spans, recording where in the document each
     * mention was found
     */
    for (int i = 0; i < linkedSpans.size(); i++) {
      LinkedSpan span = linkedSpans.get(i);
      if (span.getLinkedEntries().isEmpty()) {
        // don't care about spans that did not get linked to anything at all; nothing to work with
        continue;
      }
      /*
       * get the sentence the name span was found in; the beginning of the
       * sentence will suffice as a centroid for feature generation around the
       * named entity
       */
      Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart();
      nameMentionMap.put(i, mentionIdx);
    }
    /*
     * now associate each span with a string that will be used for
     * categorization against the model
     */
    for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) {
      featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius));
    }
    return featureBags;
  }
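
  /*
   * Worked example (indices are illustrative): with RADIUS = 200, if linked
   * span 3 sits in a sentence starting at character 540 of docText, the
   * returned map contains {3 -> getTextChunk(540, docText, 200)}, i.e. roughly
   * docText.substring(340, 740) trimmed to whole words.
   */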
  /**
   * Returns the chunk of docText that falls within radius characters of
   * mentionIdx, trimmed so that no word is chopped in half.
   */
  public String getTextChunk(int mentionIdx, String docText, int radius) {
    int docSize = docText.length();
    int left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;
    int right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius;
    String chunk = "";
    if (right <= left) {
      chunk = "";
    } else {
      /*
       * don't want to chop any words in half, so take from the first space to
       * the last space in the chunk string
       */
      chunk = docText.substring(left, right);
      if (left != 0) {
        // indexOf returns -1 when the chunk contains no space; clamp to 0 so
        // the substring below cannot throw
        left = Math.max(0, chunk.indexOf(" "));
      }
      right = chunk.lastIndexOf(" ");
      /*
       * now get the substring again with only whole words
       */
      if (left < right) {
        chunk = chunk.substring(left, right);
      }
    }
    return chunk;
  }
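
  /*
   * Edge behavior (values are illustrative): getTextChunk(50, docText, 200)
   * clamps the left edge to 0, so the chunk runs from the start of the
   * document to character 250; because the chunk begins at a document
   * boundary, no leading word needs to be trimmed there.
   */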
  private Map<String, Double> getScore(String text) throws Exception {
    Map<String, Double> scoreMap = new HashMap<>();
    // categorize returns the model's probability distribution over all categories
    double[] categorize = documentCategorizerME.categorize(text);
    int catSize = documentCategorizerME.getNumberOfCategories();
    for (int i = 0; i < catSize; i++) {
      String category = documentCategorizerME.getCategory(i);
      scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]);
    }
    return scoreMap;
  }
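
  /*
   * Example output (scores are illustrative): for a doccat model whose
   * categories are country codes, getScore("... Paris ... France ...") might
   * return {"FR" -> 0.83, "DE" -> 0.04, ...}; score() then copies the entry
   * matching each link's item parent id into that link's score map under the
   * "countrymodel" key.
   */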
} |